Commit 4444e79e authored by Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_memcpy_with_dist

......@@ -55,12 +55,13 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contribution" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -147,7 +148,16 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/grpc)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
include(external/grpc)
else()
include(external/leveldb)
include(external/brpc)
endif()
endif()
include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
......
......@@ -24,10 +24,12 @@ Currently supported `--model` arguments include:
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --device GPU
```
You can choose to train on GPU or CPU. With GPU training, you can specify
`--gpus <gpu_num>` to run multi-GPU training.
The parameter server can also run in async mode: specify
`--async_mode` to train the model asynchronously.
* Run distributed training with parameter servers:
* See [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) for an example.
* start parameter servers:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
__all__ = ['parse_args', ]
BENCHMARK_MODELS = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
]
def parse_args():
parser = argparse.ArgumentParser('Fluid model benchmarks.')
parser.add_argument(
'--model',
type=str,
choices=BENCHMARK_MODELS,
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
# args related to learning rate
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The number of initial minibatches to skip, for a better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data format; currently only NCHW is supported.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--cpus',
type=int,
default=1,
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
action='store_true',
help='If set, do not test the testset during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether to split variables into blocks when update_method is pserver')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether to start the pserver in async mode to support ASGD')
parser.add_argument(
'--use_reader_op',
action='store_true',
help='Whether to use the reader op; the data path must be specified if this is set.'
)
parser.add_argument(
'--data_path',
type=str,
default="",
help='Directory that contains all the training recordio files.')
args = parser.parse_args()
return args
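For orientation, here is a hypothetical sketch of how a driver script might consume these flags. The dispatch below is illustrative only, derived from the help strings above; it is not the actual control flow of `fluid_benchmark.py`.
```python
# Hypothetical driver sketch: illustrative only, derived from the help
# strings above; not the real fluid_benchmark.py control flow.
from args import parse_args


def main():
    args = parse_args()

    if args.update_method == 'pserver':
        # Parameter-server training; --async_mode selects ASGD-style updates.
        mode = 'async' if args.async_mode else 'sync'
        print('parameter-server training (%s mode)' % mode)
    elif args.update_method == 'nccl2':
        print('multi-node training with NCCL2 all-reduce')
    elif args.device == 'GPU' and args.gpus > 1:
        # Per the --gpus help string, more than one GPU implies ParallelExecutor.
        print('local multi-GPU training on %d GPUs' % args.gpus)
    else:
        print('single-device training on %s' % args.device)


if __name__ == '__main__':
    main()
```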
......@@ -24,108 +24,7 @@ import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
BENCHMARK_MODELS = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
]
def parse_args():
parser = argparse.ArgumentParser('Fluid model benchmarks.')
parser.add_argument(
'--model',
type=str,
choices=BENCHMARK_MODELS,
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size',
type=int,
default=32,
help='The batch size on each gpu.')
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations',
type=int,
default=80,
help='The number of minibatches, set to -1 to run all batches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data data_format, now only support NCHW.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--cpus',
type=int,
default=1,
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
action='store_true',
help='If set, do not test the testset during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set ommit the actual read data operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--use_reader_op',
action='store_true',
help='Whether to use reader op, and must specify the data path if set this to true.'
)
parser.add_argument(
'--data_path',
type=str,
default="",
help='Directory that contains all the training recordio files.')
args = parser.parse_args()
return args
from args import *
def append_nccl2_prepare(trainer_id):
......@@ -160,7 +59,7 @@ def append_nccl2_prepare(trainer_id):
"nccl-based dist train.")
def dist_transpile(trainer_id):
def dist_transpile(trainer_id, args):
if trainer_id < 0:
return None, None
......@@ -182,7 +81,12 @@ def dist_transpile(trainer_id):
training_role = os.getenv("PADDLE_TRAINING_ROLE")
t = distribute_transpiler.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
t.transpile(
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode,
slice_var_up=not args.no_split_var)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(current_endpoint,
......@@ -417,7 +321,7 @@ def main():
fluid.memory_optimize(fluid.default_main_program())
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id)
train_prog, startup_prog = dist_transpile(trainer_id, args)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
......
......@@ -104,8 +104,9 @@ def get_model(args):
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'))
shape=[1], dtype='int64'), total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
......
......@@ -82,7 +82,8 @@ def get_model(args):
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
......
......@@ -166,3 +166,7 @@ if(WITH_GOLANG)
endif()
endif(WITH_GOLANG)
if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf")
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/brpc/brpc"
GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2"
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_PREFIX_PATH=${prefix_path}
-DBRPC_WITH_GLOG=ON
${EXTERNAL_OPTIONAL_ARGS}
LIST_SEPARATOR |
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy)
ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
LIST(APPEND external_project_dependencies brpc)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb)
SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
ExternalProject_Add(
extern_leveldb
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${LEVELDB_SOURCES_DIR}
URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
CONFIGURE_COMMAND ""
BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
&& cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
&& cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
BUILD_IN_SOURCE 1
)
ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
......@@ -610,3 +610,21 @@ function(grpc_library TARGET_NAME)
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()
function(brpc_library TARGET_NAME)
set(oneValueArgs PROTO)
set(multiValueArgs SRCS DEPS)
set(options "")
cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
message(STATUS "generating brpc ${brpc_library_PROTO}")
get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE)
get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
endfunction()
# API Comment Writing Standard
- [API Comment Sections](#API注释模块)
- [Format and Examples](#格式及示例)
- [Complete Example](#完整示例)
- [API Comment Writing Standard](#api)
- [API Comment Sections](#api)
- [Format and Examples](#)
- [Complete Example](#)
## API Comment Sections
......@@ -217,4 +218,4 @@ API documentation must be written in the reStructuredText format; for details on the format, see the [link]
## Complete Example
The complete comments for fc can be found in the [example](src/fc.py)
The complete comments for fc can be found in the [example](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)
# API Doc Standard
- [API Doc Structure](#API Doc Structure)
- [Format and Examples](#Format and Examples)
- [Complete Example](#Complete Example)
- [API Doc Standard](#api-doc-standard)
- [API Doc Structure](#api-doc-structure)
- [Format and Examples](#format-and-examples)
- [Complete Example](#complete-example)
## API Doc Structure
......@@ -223,4 +224,4 @@ Format and examples of each part of API documentation are as follows: (take fc for example)
## Complete Example
Complete Example of fc please see [here](src/fc.py)
Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)
# Automatic Differentiation with the Tape
## Automatic Differentiation
A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
## The Tape
Given the forward pass program (usually written in Python in practice), there are two strategies for deriving the backward pass:
1. from the forward pass program itself, or
1. from the execution trace of the forward pass program, which is often known as the *tape*.
This article surveys systems that follow the latter strategy.
## Dynamic Network
When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*.
Deep learning systems built around the idea of dynamic networks have gained popularity in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
## An Overview
Both frameworks record a ‘tape’ of the computation and interpret (or run-time compile) a transformation of the tape, which is played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
Consider the following feedforward model.
```python
x = Variable(randn(20, 1))
label = Variable(randint(1))
W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
h = matmul(W_1, x)
pred = matmul(W_2, h)
loss = softmax(pred, label)
loss.backward()
```
### 1) Dynet uses List to encode the Tape
During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, is recorded on the tape, along with the information needed for the backward pass, such as pointers to the inputs and outputs. The tape is then played back in reverse order at `loss.backward()`.
<details>
<summary></summary>
digraph g {
graph [
rankdir = "LR"
];
node [
fontsize = "16"
shape = "ellipse"
];
edge [];
"node0" [
label = "<f0> type: matmul | <f1> input: W_1, x | <f2> output: h"
shape = "record"
];
"node1" [
label = "<f0> type: matmul | <f1> input: W_2, h | <f2> output: pred"
shape = "record"
];
"node2" [
label = "<f0> type: softmax | <f1> input: pred, label | <f2> output: loss"
shape = "record"
];
"node0":f0 -> "node1":f0 [];
"node1":f0 -> "node2":f0 [];
}
</details>
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
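To make the list encoding concrete, below is a minimal Python sketch of a Dynet-style tape. The names (`TapeEntry`, `tape`, the toy `matmul`) are ours for illustration and are not DyNet's API: every forward call appends a record holding pointers to its inputs and output plus a closure for its backward rule, and `backward()` simply replays the list in reverse.
```python
import numpy as np

tape = []  # the global tape: a list of recorded operator entries


class TapeEntry(object):
    """One recorded op: just enough information to run its backward rule."""

    def __init__(self, op_type, inputs, output, backward_fn):
        self.op_type = op_type
        self.inputs = inputs            # Variables read by the op
        self.output = output            # Variable produced by the op
        self.backward_fn = backward_fn  # maps d(output) -> [d(input), ...]


class Variable(object):
    def __init__(self, value):
        self.value = value
        self.grad = np.zeros_like(value)

    def backward(self):
        self.grad = np.ones_like(self.value)  # seed d(loss)/d(loss) = 1
        # Play the tape back in reverse: no topological sort is needed,
        # because the recording order is already a valid execution order.
        for entry in reversed(tape):
            input_grads = entry.backward_fn(entry.output.grad)
            for var, g in zip(entry.inputs, input_grads):
                var.grad += g  # accumulate; a variable may be used twice


def matmul(a, b):
    out = Variable(a.value @ b.value)
    tape.append(TapeEntry(
        "matmul", [a, b], out,
        backward_fn=lambda d_out: [d_out @ b.value.T, a.value.T @ d_out]))
    return out


# Forward pass records two matmul entries; backward replays them in reverse.
W_1, x = Variable(np.random.randn(20, 20)), Variable(np.random.randn(20, 1))
W_2 = Variable(np.random.randn(10, 20))
h = matmul(W_1, x)
pred = matmul(W_2, h)
pred.backward()
```
Note that the reverse walk needs no sorting at all; the recording order is already a valid execution order, which is exactly the first advantage listed under "Design choices" below.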
### 2) Pytorch uses Node Graph to encode the Tape
The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`, and a `Function` records its inputs' previous/dependent functions as `prev_func`, obtained through each input's `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s, and the grad ops are then executed in that order.
<details>
<summary></summary>
digraph g {
graph [
rankdir = "LR"
];
subgraph function {
node [
fontsize = "16"
style = filled
shape = "record"
];
"matmul0" [ label = "<f0> type: matmul | prev_func: None" ];
"matmul1" [ label = "<f0> type: matmul | prev_func: matmul" ];
"softmax" [ label = "<f0> type: softmax | prev_func: matmul" ];
}
subgraph variable {
node [
fontsize = "16"
shape = "Mrecord"
style = filled
fillcolor = white
];
"x" [ label = "<f0> x | <f1> creator: None" ];
"label" [ label = "<f0> label | <f1> creator: None" ];
"W_1" [ label = "<f0> W_1 | <f1> creator: None" ];
"W_2" [ label = "<f0> W_2 | <f1> creator: None" ];
"h" [ label = "<f0> h | <f1> creator: None" ];
"pred" [ label = "<f0> pred | <f1> creator: matmul" ];
"loss" [ label = "<f0> loss | <f1> creator: softmax" ];
}
subgraph data_flow {
"x":f0 -> "matmul0":f0;
"W_1":f0 -> "matmul0":f0;
"matmul0":f0 -> "h":f0;
"h":f0 -> "matmul1":f0;
"W_2":f0 -> "matmul1":f0;
"matmul1":f0 -> "pred":f0;
"pred":f0 -> "softmax":f0;
"label":f0 -> "softmax":f0;
"softmax":f0 -> "loss":f0;
}
subgraph prev_func {
edge [color="red", arrowsize="0.6", penwidth="1", constraint=false];
"matmul1":f1 -> "matmul0":f0;
"softmax":f1 -> "matmul1":f0;
label = "prev_func";
}
}
</details>
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
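For comparison, here is a correspondingly small sketch of the node-graph encoding. The names (`MatMul`, `prev_funcs`, `toposort`) are again illustrative and only approximate PyTorch's internals: each function node remembers the creators of its inputs, and `backward()` topologically sorts that chain before running the gradient rules.
```python
import numpy as np


class Variable(object):
    def __init__(self, value, creator=None):
        self.value = value
        self.creator = creator  # the Function node that produced this Variable
        self.grad = np.zeros_like(value)


class MatMul(object):
    """A Function node: remembers its inputs and its dependent functions."""

    def __init__(self, a, b):
        self.inputs = (a, b)
        # prev_funcs: creators of the inputs (None for leaf Variables).
        self.prev_funcs = tuple(v.creator for v in (a, b))
        self.output = Variable(a.value @ b.value, creator=self)

    def backward(self):
        a, b = self.inputs
        d_out = self.output.grad
        a.grad += d_out @ b.value.T
        b.grad += a.value.T @ d_out


def toposort(root_func):
    """Depth-first post-order over prev_funcs, yielding output-side funcs first."""
    order, visited = [], set()

    def visit(f):
        if f is None or f in visited:
            return
        visited.add(f)
        for p in f.prev_funcs:
            visit(p)
        order.append(f)

    visit(root_func)
    return reversed(order)  # run the loss's creator first


def backward(loss):
    loss.grad = np.ones_like(loss.value)
    for func in toposort(loss.creator):
        func.backward()


# The graph is built eagerly; no explicit tape object exists.
W_1, x = Variable(np.random.randn(20, 20)), Variable(np.random.randn(20, 1))
W_2 = Variable(np.random.randn(10, 20))
h = MatMul(W_1, x).output
pred = MatMul(W_2, h).output
backward(pred)
```
Here the graph itself, rather than a global list, is the only record of the computation, so independent subgraphs can be differentiated or freed separately; this is the flexibility point discussed under "Design choices" below.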
Chainer and Autograd use similar techniques to record the forward pass. For details, please refer to the appendix.
## Design choices
### 1) Dynet's List vs Pytorch's Node Graph
What's good about List:
1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and call the corresponding backward operators.
1. It enables efficient data-parallel implementations. One can count how many times a variable is used while constructing the list; during playback, this tells when the computation of that variable has completed, which makes it possible to overlap communication with computation.
What's good about Node Graph:
1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) In the following example, PyTorch only runs backward on SmallNet, while Dynet runs backward on both BigNet and SmallNet.
```python
result = BigNet(data)
loss = SmallNet(data)
loss.backward()
```
### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
Dynet builds the list in a symbolic manner. Consider the following example:
```python
for epoch in range(num_epochs):
for in_words, out_label in training_data:
dy.renew_cg()
W = dy.parameter(W_p)
b = dy.parameter(b_p)
score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
loss_val = loss_sym.value()
loss_sym.backward()
```
The computation of `lookup`, `concat`, `matmul` and `softmax` does not happen until `loss_sym.value()` is called. This deferred execution is useful because it makes graph-level optimizations possible, e.g. kernel fusion.
PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (there is no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
## What can fluid learn from them?
TBD
# Appendix
### Overview
| Framework | Has Tape | Core in C++ | First Release Date |
|-----------|----------|-------------|--------------------|
| Autograd | No | No | Mar 5, 2015 |
| Chainer | No | No | Jun 5, 2015 |
| Pytorch | No | Yes | Aug 31, 2016 |
| Dynet | Yes | Yes | Oct 12, 2016 |
### Source Code
#### Autograd
[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed.
```python
# User API
def make_grad(fun, x):
start_node = VJPNode.new_root()
end_value, end_node = trace(start_node, fun, x)
return backward_pass(g, end_node), end_value
# trace the forward pass by creating VJPNodes
def trace(start_node, fun, x):
with trace_stack.new_trace() as t:
start_box = new_box(x, t, start_node)
end_box = fun(start_box)
return end_box._value, end_box._node
def backward_pass(g, end_node):
outgrads = {end_node : (g, False)}
for node in toposort(end_node):
outgrad = outgrads.pop(node)
ingrads = node.vjp(outgrad[0])
for parent, ingrad in zip(node.parents, ingrads):
outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
return outgrad[0]
# Every VJPNode corresponds to a op_grad
class VJPNode(Node):
__slots__ = ['parents', 'vjp']
def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
self.parents = parents
vjpmaker = primitive_vjps[fun]
self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
```
#### Chainer
Example Code
```python
# (1) Function Set definition, creates FunctionNode
model = FunctionSet(
l1=F.Linear(784, 100),
l2=F.Linear(100, 100),
l3=F.Linear(100, 10)).to_gpu()
# (2) Optimizer Setup
opt = optimizers.SGD()
opt.setup(model)
# (3) Forward computation
def forward(x, t):
h1 = F.relu(model.l1(x))
h2 = F.relu(model.l2(h1))
y = model.l3(h2)
return F.softmax_cross_entropy(y, t)
# (4) Training loop
for epoch in xrange(n_epoch):
for i in xrange(0, N, b_size):
x = Variable(to_gpu(...))
t = Variable(to_gpu(...))
opt.zero_grads()
loss = forward(x, t)
loss.backward()
opt.update()
```
In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`.
```python
class FunctionNode(object):
...
def apply(self, inputs):
outputs = self.forward(inputs)
ret = tuple([variable.Variable(y, requires_grad=requires_grad)
for y in outputs])
# Topological ordering
self.rank = max([x.rank for x in inputs]) if input_vars else 0
# Add backward edges
for y in ret:
y.creator_node = self
self.inputs = tuple([x.node for x in input_vars])
self.outputs = tuple([y.node for y in ret])
return ret
```
`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order.
```python
class VariableNode(object):
...
def backward(self, retain_grad, loss_scale):
if self.creator_node is None:
return
cand_funcs = []
seen_set = set()
grads = {}
# Initialize error by 1, if this is a loss variable
if self.data.size == 1 and self._grad_var is None:
self.grad = numpy.ones_like(self.data)
grads[self._node] = self._grad_var
def add_cand(cand):
if cand not in seen_set:
# Negate since heapq is min-heap. This is a global variable
heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
seen_set.add(cand)
add_cand(self.creator_node)
while cand_funcs:
_, _, func = heapq.heappop(cand_funcs)
gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
for x, gx in zip(func.inputs, gxs):
if x in grads:
grads[x] += gx
else:
grads[x] = gx
if x.creator_node is not None:
add_cand(x.creator_node)
```
#### PyTorch
Example Code
```python
x = Variable(torch.ones(5, 5))
y = Variable(torch.ones(5, 5) * 4)
z = x ** 2 + x * 2 + x * y + y
z.backward(torch.ones(5, 5))
```
The trace is done by `Variable.creator` and `Function.previous_functions`.
```python
class Variable(object):
def __init__(self, tensor, creator=None, requires_grad=True):
if creator is None:
creator = Leaf(self, requires_grad)
self.data = tensor
self.creator = creator
self._grad = None
def backward(self, gradient=None):
if gradient is None:
if self.data.numel() != 1:
raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
gradient = self.data.new(1).fill_(1)
self._execution_engine.run_backward(self, gradient)
class Function(object):
# ...
def _do_forward(self, *input):
unpacked_input = tuple(arg.data for arg in input)
raw_output = self.forward(*unpacked_input)
# mark output.creator = self for backward trace
output = tuple(Variable(tensor, self) for tensor in raw_output)
self.previous_functions = [(arg.creator, id(arg)) for arg in input]
self.output_ids = {id(var): i for i, var in enumerate(output)}
return output
def _do_backward(self, grad_output):
return self.backward(grad_output)
```
The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd.
#### DyNet
Example code
```python
model = dy.model()
W_p = model.add_parameters((20, 100))
b_p = model.add_parameters(20)
E = model.add_lookup_parameters((20000, 50))
for epoch in range(num_epochs):
for in_words, out_label in training_data:
dy.renew_cg() # init tape
W = dy.parameter(W_p)
b = dy.parameter(b_p)
score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
loss_val = loss_sym.value()
loss_sym.backward()
```
[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traversing the tape in reverse order.
```c++
void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
...
for (int i = num_nodes - 1; i >= 0; --i) {
// each node corresponds to an op
node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
}
...
}
```
......@@ -106,7 +106,7 @@ PaddlePaddle needs to be built inside a Docker environment, which saves you from separately installing
- How hard is it to learn Docker?
Understanding Docker is not hard; about ten minutes with `this article <https://zhuanlan.zhihu.com/p/19902938>`_ is enough. It saves you an hour of installing and configuring various development tools, plus the trouble of reinstalling them whenever you switch machines. And don't forget that a PaddlePaddle update may require new development tools, not to mention how much it simplifies reproducing problems.
Understanding Docker is not hard; about ten minutes with `How to use Docker <https://zhuanlan.zhihu.com/p/19902938>`_ is enough. It saves you an hour of installing and configuring various development tools, plus the trouble of reinstalling them whenever you switch machines. And don't forget that a PaddlePaddle update may require new development tools, not to mention how much it simplifies reproducing problems.
- Can I use an IDE?
......@@ -123,7 +123,7 @@ PaddlePaddle needs to be built inside a Docker environment, which saves you from separately installing
- Can I build in parallel?
Yes. Our Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ . The script calls `make -j$(nproc)` to launch as many processes as there are CPU cores and build in parallel.
Yes. Our Docker image runs a `Paddle build Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ . The script calls `make -j$(nproc)` to launch as many processes as there are CPU cores and build in parallel.
- Docker requires sudo
......@@ -131,11 +131,11 @@ PaddlePaddle needs to be built inside a Docker environment, which saves you from separately installing
- Building on Windows/MacOS is very slow
Docker runs on both Windows and MacOS, but it actually runs inside a Linux virtual machine. You may need to allocate more CPU and memory to this virtual machine to keep the build efficient. For details, see `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ .
Docker runs on both Windows and MacOS, but it actually runs inside a Linux virtual machine. You may need to allocate more CPU and memory to this virtual machine to keep the build efficient. For details, see `how to give Docker on Windows/Mac more memory and CPUs <https://github.com/PaddlePaddle/Paddle/issues/627>`_ .
- Not enough disk space
In the examples in this document, the `docker run` commands all use the `--rm` flag, so containers are not kept on disk after they exit. Use `docker ps -a` to list containers that have stopped but not been removed. The `docker build` command sometimes produces intermediate results, unnamed images, which also take up disk space; see `this article <https://zaiste.net/posts/removing_docker_containers/>`_ for how to clean them up.
In the examples in this document, the `docker run` commands all use the `--rm` flag, so containers are not kept on disk after they exit. Use `docker ps -a` to list containers that have stopped but not been removed. The `docker build` command sometimes produces intermediate results, unnamed images, which also take up disk space; see `how to remove Docker containers <https://zaiste.net/posts/removing_docker_containers/>`_ for how to clean them up.
.. _compile_deps:
......@@ -195,7 +195,7 @@ BLAS
PaddlePaddle supports two BLAS libraries: `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
`OpenBLAS <http://www.openblas.net/>`_ . MKL is used by default. If MKL is used and the machine supports the AVX2 instruction set,
the MKL-DNN math library is also downloaded; for details see `here <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
the MKL-DNN math library is also downloaded; for details see `the MKL-DNN design doc <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
If MKL is disabled, OpenBLAS is used as the BLAS library instead.
......
......@@ -109,7 +109,6 @@ void MainWord2Vec(bool use_gpu) {
void MainImageClassification(bool use_gpu) {
int batch_size = 2;
bool use_mkldnn = false;
bool repeat = false;
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
......@@ -134,12 +133,8 @@ void MainImageClassification(bool use_gpu) {
std::vector<framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace, false, true>(config.model_dir,
cpu_feeds,
cpu_fetchs1,
repeat,
is_combined,
use_mkldnn);
TestInference<platform::CPUPlace, false, true>(
config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> paddle_tensor_feeds;
......
......@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
......@@ -28,6 +28,9 @@ struct DataTypeMap {
};
static DataTypeMap* InitDataTypeMap();
// C++11 removes the need for manual locking. Concurrent execution shall wait if
// a static local variable is already being initialized.
// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
static DataTypeMap& gDataTypeMap() {
static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
return *g_data_type_map_;
......
......@@ -8,18 +8,19 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
if(WITH_GPU)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
else()
set(multi_devices_graph_builder_deps)
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
endif()
......@@ -28,10 +29,10 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer)
cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
......@@ -13,25 +13,33 @@
// limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle {
namespace framework {
namespace details {
NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap &ctxs)
#ifdef PADDLE_WITH_CUDA
AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p);
if (nccl_ctxs_) {
for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
}
}
}
#else
AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
#endif
void NCCLAllReduceOpHandle::RunImpl() {
void AllReduceOpHandle::RunImpl() {
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
......@@ -58,6 +66,8 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
......@@ -75,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_.at(dev_id);
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
......@@ -90,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() {
call();
}
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->Var()
->FindVar(out_var_handles[0]->name_)
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(in_var_handles[i]->name_);
auto *var = scope.FindVar(out_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
......@@ -118,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -20,17 +20,23 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
namespace details {
struct NCCLAllReduceOpHandle : public OpHandleBase {
NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap &ctxs);
struct AllReduceOpHandle : public OpHandleBase {
#ifdef PADDLE_WITH_CUDA
AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
#else
AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
#endif
std::string Name() const override;
// Delay and buffer nccl_all_reduce together can significantly increase
......@@ -43,7 +49,9 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
private:
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
const platform::NCCLContextMap &nccl_ctxs_;
#ifdef PADDLE_WITH_CUDA
const platform::NCCLContextMap *nccl_ctxs_;
#endif
};
} // namespace details
......
......@@ -20,7 +20,7 @@ namespace details {
struct ExecutionStrategy {
size_t num_threads_{0};
bool use_event_{true};
bool use_cuda_{true};
bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100};
};
......
......@@ -42,7 +42,7 @@ void FuseVarsOpHandle::RunImpl() {
out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel;
}
this->RunAndRecordEvent([this] {});
this->RunAndRecordEvent([] {});
}
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
......
......@@ -17,6 +17,7 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
......@@ -26,10 +27,6 @@
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
namespace paddle {
namespace framework {
namespace details {
......@@ -89,7 +86,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
for (auto *op : program.Block(0).AllOps()) {
// TODO(Yancey1989): use a graceful method to find send op,
// instead of the the hard code string
if (op->Type() == "send_vars") {
if (op->Type() == "send") {
auto op_vars = op->InputArgumentNames();
send_vars.reserve(send_vars.size() +
std::distance(op_vars.begin(), op_vars.end()));
......@@ -282,7 +279,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0);
} else {
InsertNCCLAllReduceOp(&result, g_name);
InsertAllReduceOp(&result, g_name);
}
break;
}
......@@ -325,6 +322,19 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(
return false;
}
void MultiDevSSAGraphBuilder::SetCommunicationContext(
OpHandleBase *op_handle, const platform::Place &p) const {
#ifdef PADDLE_WITH_CUDA
if (nccl_ctxs_ == nullptr) {
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
}
#else
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
#endif
}
void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
const std::string &p_name,
size_t src_dev_id) const {
......@@ -339,15 +349,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
op_handle->AddInput(in);
for (size_t i = 0; i < places_.size(); ++i) {
auto &vars = result->vars_.at(i).at(p_name);
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
auto &vars = result->vars_.at(i).at(p_name);
auto *out_var = new VarHandle(vars.size(), i, p_name, p);
vars.emplace_back(out_var);
op_handle->AddOutput(out_var);
#ifndef ADDLE_WITH_CUDA
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
#endif
}
}
......@@ -359,15 +366,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
CreateOpHandleIOs(result, op, dev_id);
}
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const {
void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
#else
result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
#endif
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
auto &vars = result->vars_[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
......@@ -377,9 +388,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
vars.emplace_back(var);
op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
......@@ -418,7 +426,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
auto *communication_dev_ctx =
nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i])
: platform::DeviceContextPool::Instance().Get(places_[i]);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
......@@ -463,12 +473,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &vars = result->vars_[i][og];
#ifndef PADDLE_WITH_CUDA
auto &p = places_[i];
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
#endif
SetCommunicationContext(op_handle, p);
auto &vars = result->vars_[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
......@@ -508,17 +515,17 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op,
op.Type(), places_[device_id]));
if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send_vars");
ConnectOp(result, result->ops_.back().get(), "send");
} else if (op.Type() == "recv") {
ConnectOp(result, result->ops_.back().get(), "send_barrier");
} else if (op.Type() == "fetch_barrier") {
ConnectOp(result, result->ops_.back().get(), "recv");
} else if (op.Type() == "send_vars") {
} else if (op.Type() == "send") {
// do nothing
} else {
PADDLE_THROW(
"rpc op should be in ["
"send_vars, send_barrier. recv, fetch_barrier]");
"send, send_barrier. recv, fetch_barrier]");
}
// TODO(Yancey1989): schedule rpc op on different place may
......
......@@ -109,7 +109,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const;
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const;
......@@ -121,6 +121,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
private:
BuildStrategy strategy_;
mutable std::unordered_map<std::string, int> remote_vars_devices_;
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
};
} // namespace details
} // namespace framework
......
......@@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() {
#endif
}
void OpHandleBase::Run(bool use_event) {
void OpHandleBase::Run(bool use_cuda) {
#ifdef PADDLE_WITH_CUDA
if (events_.empty() && use_event) {
if (events_.empty() && use_cuda) {
for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id));
......@@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) {
}
}
#else
PADDLE_ENFORCE(!use_event);
PADDLE_ENFORCE(!use_cuda);
#endif
RunImpl();
......
......@@ -36,7 +36,7 @@ class OpHandleBase {
virtual std::string Name() const = 0;
void Run(bool use_event);
void Run(bool use_cuda);
virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
......
......@@ -37,7 +37,9 @@ struct ReduceLoDTensor {
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
if (dst != t0.data<T>()) {
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
}
for (size_t i = 1; i < src_tensors_.size(); ++i) {
auto &t = *src_tensors_[i];
......
......@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include <utility>
namespace paddle {
namespace framework {
......
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/graph_builder_factory.h"
#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
#include <fstream>
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
namespace paddle {
......@@ -40,6 +41,8 @@ std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
res.reset(new SSAGraghBuilderWithPrinter(
std::move(fout), std::move(graphviz_printer), std::move(res)));
}
res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
return res;
}
} // namespace details
......
......@@ -40,7 +40,11 @@ class SSAGraphBuilderFactory {
loss_var_name_(loss_var_name),
param_names_(param_names),
local_scopes_(local_scopes),
strategy_(strategy) {}
strategy_(strategy) {
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_ = nullptr;
#endif
}
#ifdef PADDLE_WITH_CUDA
void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph.h"
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
namespace paddle {
namespace framework {
namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->generated_op_ == nullptr) {
ready_vars.emplace(var);
}
};
for (auto &var_map : graph->vars_) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair.get());
}
}
}
for (auto &var : graph->dep_vars_) {
insert_pending_var(var.get());
}
for (auto &op : graph->ops_) {
if (op->Inputs().empty()) {
ready_ops.insert(op.get());
} else {
pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
}
}
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
}
}
set.clear();
};
while (!pending_vars.empty()) {
run_all_ops(ready_ops);
if (ready_vars.empty()) {
return false;
}
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->pending_ops_) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
}
}
}
ready_vars.clear();
}
return true;
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
namespace framework {
namespace details {
class SSAGraph;
class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
public:
explicit SSAGraghBuilderWithChecker(
std::unique_ptr<SSAGraphBuilder>&& builder)
: builder_(std::move(builder)) {}
std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
auto graph = builder_->Build(program);
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
}
bool IsValidGraph(const SSAGraph* graph) const;
private:
std::unique_ptr<SSAGraphBuilder> builder_;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -185,6 +185,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
ready_vars->Push(var);
}
}
void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
......@@ -192,7 +193,7 @@ void ThreadedSSAGraphExecutor::RunOp(
if (VLOG_IS_ON(10)) {
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
}
op->Run(strategy_.use_event_);
op->Run(strategy_.use_cuda_);
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
namespace paddle {
namespace framework {
......@@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
auto ctx = Prepare(pdesc, block_id);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
......@@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId);
if (FLAGS_use_mkldnn) EnableMKLDNN(program);
bool has_feed_ops =
has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
bool has_fetch_ops =
......@@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
unique_ptr_of_copy_program.reset(new ProgramDesc(program));
copy_program = unique_ptr_of_copy_program.get();
}
auto* global_block = copy_program->MutableBlock(0);
if (!has_feed_ops) {
......@@ -378,5 +380,19 @@ void Executor::RunPreparedContext(
}
}
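// EnableMKLDNN walks every block of the program and turns on the
// "use_mkldnn" attribute for each op that declares it, so MKLDNN kernels are
// picked at runtime (a no-op unless built with PADDLE_WITH_MKLDNN).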
void Executor::EnableMKLDNN(const ProgramDesc& program) {
#ifdef PADDLE_WITH_MKLDNN
VLOG(3) << "use_mkldnn=True";
for (size_t bid = 0; bid < program.Size(); ++bid) {
auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
for (auto* op : block->AllOps()) {
if (op->HasAttr("use_mkldnn")) {
op->SetAttr("use_mkldnn", true);
}
}
}
#endif
}
} // namespace framework
} // namespace paddle
......@@ -81,6 +81,8 @@ class Executor {
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
void EnableMKLDNN(const ProgramDesc& program);
private:
const platform::Place place_;
};
......
......@@ -71,6 +71,7 @@ message OpProto {
optional bool duplicable = 3 [ default = false ];
optional bool intermediate = 4 [ default = false ];
optional bool dispensable = 5 [ default = false ];
optional string reuse = 6;
}
// AttrProto describes the C++ type Attribute.
......
......@@ -17,12 +17,11 @@ limitations under the License. */
namespace paddle {
namespace framework {
static OpInfoMap* g_op_info_map = nullptr;
// C++11 removes the need for manual locking. Concurrent execution shall wait if
// a static local variable is already being initialized.
// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
OpInfoMap& OpInfoMap::Instance() {
if (g_op_info_map == nullptr) {
g_op_info_map = new OpInfoMap();
}
static OpInfoMap* g_op_info_map = new OpInfoMap();
return *g_op_info_map;
}
} // namespace framework
......
......@@ -21,6 +21,7 @@ namespace framework {
void OpProtoAndCheckerMaker::Validate() {
validated_ = true;
CheckNoDuplicatedInOutAttrs();
CheckReuseVars();
}
OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
......@@ -56,6 +57,24 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
}
}
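// CheckReuseVars verifies that every output marked with Reuse() refers to an
// input name that is actually registered on this op proto.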
void OpProtoAndCheckerMaker::CheckReuseVars() {
std::unordered_set<std::string> names;
for (auto& input : proto_->inputs()) {
names.insert(input.name());
}
auto checker = [&](const std::string& name, const std::string& reused) {
PADDLE_ENFORCE(
names.count(reused),
"Output [%s] reuse Input [%s], but the input is not registered.", name,
reused);
};
for (auto& output : proto_->outputs()) {
if (output.has_reuse()) {
checker(output.name(), output.reuse());
}
}
}
void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
OpAttrChecker* attr_checker) {
proto_ = proto;
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <string>
#include <unordered_set>
#include "glog/logging.h"
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
......@@ -64,6 +66,11 @@ class OpProtoAndCheckerMaker {
var_->set_dispensable(true);
return *this;
}
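  // Reuse marks this output as sharing memory with (reusing in place) the
  // named input; the pairing is validated later by CheckReuseVars().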
VariableBuilder &Reuse(const std::string &name) {
var_->set_reuse(name);
return *this;
}
};
VariableBuilder AddInput(const std::string &name, const std::string &comment);
......@@ -89,6 +96,8 @@ class OpProtoAndCheckerMaker {
void CheckNoDuplicatedInOutAttrs();
void Validate();
void CheckReuseVars();
proto::OpProto *proto_;
OpAttrChecker *op_checker_;
bool validated_{false};
......
......@@ -47,3 +47,23 @@ TEST(ProtoMaker, DuplicatedInOut) {
ASSERT_THROW(proto_maker(&op_proto, &op_checker),
paddle::platform::EnforceNotMet);
}
class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("XOut", "output of test op").Reuse("X");
AddOutput("NoOut", "output of test op").Reuse("NotExists");
}
};
TEST(ProtoMaker, InplaceOutput) {
paddle::framework::proto::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
TestInplaceProtoMaker proto_maker;
ASSERT_THROW(proto_maker(&op_proto, &op_checker),
paddle::platform::EnforceNotMet);
// proto_maker(&op_proto, &op_checker);
// proto_maker.Make();
// ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
......@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/graph_builder_factory.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -43,7 +43,8 @@ class ParallelExecutorPrivate {
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
bool own_local_scope;
bool own_local_scope_;
bool use_cuda_;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
......@@ -60,35 +61,40 @@ ParallelExecutor::ParallelExecutor(
size_t num_trainers, size_t trainer_id)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
// Step 1. Bcast the params to devs.
// Create local scopes
if (local_scopes.empty()) {
member_->own_local_scope = true;
member_->own_local_scope_ = true;
member_->local_scopes_.emplace_back(member_->global_scope_);
for (size_t i = 1; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&scope->NewScope());
}
} else {
member_->own_local_scope = false;
member_->own_local_scope_ = false;
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
}
}
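  // NCCL contexts are created only when running with CUDA; CPU-only
  // execution skips this branch entirely.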
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
if (nccl_id_var != nullptr) {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
}
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id));
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
if (nccl_id_var != nullptr) {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
}
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id));
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
local_scopes.empty()) { // Is CUDA
}
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToGPUs(bcast_vars);
}
// Startup Program has been run. All local scopes have correct parameters.
......@@ -107,10 +113,14 @@ ParallelExecutor::ParallelExecutor(
details::SSAGraphBuilderFactory builder_factory(
member_->places_, loss_var_name, params, member_->local_scopes_,
build_strategy);
if (member_->use_cuda_) {
#ifdef PADDLE_WITH_CUDA
builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
builder_ = std::move(builder_factory.Create());
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places,
......@@ -123,7 +133,6 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0];
for (auto &var : vars) {
......@@ -135,6 +144,7 @@ void ParallelExecutor::BCastParamsToGPUs(
auto &main_tensor = main_var->Get<LoDTensor>();
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
platform::NCCLGroupGuard guard;
......@@ -161,6 +171,10 @@ void ParallelExecutor::BCastParamsToGPUs(
nccl_ctx.comm_, nccl_ctx.stream());
}
}
member_->nccl_ctxs_->WaitAll();
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
......@@ -171,11 +185,7 @@ void ParallelExecutor::BCastParamsToGPUs(
paddle::framework::TensorCopy(main_tensor, cpu, t);
}
}
member_->nccl_ctxs_->WaitAll();
}
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
......@@ -221,7 +231,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
ParallelExecutor::~ParallelExecutor() {
if (member_->own_local_scope) {
if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
}
......
......@@ -64,7 +64,8 @@ class TRTConvertValidation {
TRTConvertValidation(int batch_size,
const std::unordered_set<std::string>& parameters,
framework::Scope& scope, int workspace_size = 1 << 10)
framework::Scope& scope, // NOLINT
int workspace_size = 1 << 10)
: parameters_(parameters), scope_(scope) {
// create engine.
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
......@@ -151,7 +152,8 @@ class TRTConvertValidation {
// Compare two output
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out.size(); i++) {
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
// Loosen the threshold for CI on different machine models.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
}
}
}
......
......@@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");
DEFINE_bool(skip_cpu, false, "Skip the cpu test");
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
TEST(inference, image_classification) {
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
......@@ -59,10 +58,8 @@ TEST(inference, image_classification) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---";
LOG(INFO) << "Batch size is " << FLAGS_batch_size;
LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn;
TestInference<paddle::platform::CPUPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined,
FLAGS_use_mkldnn);
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
}
......
......@@ -27,7 +27,6 @@ limitations under the License. */
DEFINE_string(model_path, "", "Directory of the inference model.");
DEFINE_string(data_file, "", "File of input index data.");
DEFINE_int32(repeat, 100, "Running the inference program repeat times");
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
DEFINE_int32(num_threads, 1, "Number of threads should be used");
......@@ -190,9 +189,6 @@ TEST(inference, nlp) {
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
/*model combined*/ false);
if (FLAGS_use_mkldnn) {
EnableMKLDNN(inference_program);
}
// always prepare context
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
ctx = executor.Prepare(*inference_program, 0);
......
......@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(use_mkldnn);
template <typename T>
void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, T lower, T upper) {
......@@ -133,24 +135,11 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return feed_target_shapes;
}
void EnableMKLDNN(
const std::unique_ptr<paddle::framework::ProgramDesc>& program) {
for (size_t bid = 0; bid < program->Size(); ++bid) {
auto* block = program->MutableBlock(bid);
for (auto* op : block->AllOps()) {
if (op->HasAttr("use_mkldnn")) {
op->SetAttr("use_mkldnn", true);
}
}
}
}
template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1, const bool is_combined = false,
const bool use_mkldnn = false) {
const int repeat = 1, const bool is_combined = false) {
// 1. Define place, executor, scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
......@@ -182,9 +171,6 @@ void TestInference(const std::string& dirname,
"init_program",
paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined);
if (use_mkldnn) {
EnableMKLDNN(inference_program);
}
}
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
......@@ -210,7 +196,10 @@ void TestInference(const std::string& dirname,
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
// 6. Run the inference program
// 6. If FLAGS_use_mkldnn=true is exported, use MKLDNN related ops.
if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program);
// 7. Run the inference program
{
if (!CreateVars) {
// If users don't want to create and destroy variables every time they
......
......@@ -186,19 +186,23 @@ endif()
add_subdirectory(detail)
if(WITH_DISTRIBUTE)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
else()
set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
endif()
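    # Distributed ops link against the gRPC stack when WITH_GRPC is ON,
    # otherwise against brpc plus leveldb/ssl/crypto.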
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......@@ -208,15 +212,18 @@ if(WITH_DISTRIBUTE)
# listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op
listen_and_serv_op executor SERIAL)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL)
if(WITH_GRPC)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
else()
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc)
endif()
set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif()
else()
set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
endif()
op_library(cross_entropy_op DEPS cross_entropy)
......
......@@ -24,12 +24,12 @@ namespace operators {
: public ::paddle::framework::OpProtoAndCheckerMaker { \
public: \
void Make() override { \
AddInput("X", "Input of " #OP_NAME "operator"); \
AddOutput("Out", "Output of" #OP_NAME "operator"); \
AddInput("X", "Input of " #OP_NAME " operator"); \
AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \
AddAttr<bool>("use_mkldnn", \
"(bool, default false) Only used in mkldnn kernel") \
.SetDefault(false); \
AddComment(#OP_COMMENT); \
AddComment(OP_COMMENT); \
} \
}
......
......@@ -89,9 +89,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param");
AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1");
AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2");
AddAttr<float>("beta1",
"(float, default 0.9) "
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp,
paddle::operators::ArgMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
arg_max,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OP_CUDA_KERNEL(
arg_max,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
size_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace operators {
enum ArgMinMaxType { kArgMin, kArgMax };
template <typename DeviceContext, typename T, typename Tout, int64_t Rank,
ArgMinMaxType argMinMaxValue>
struct ArgMinMaxFunctor {};
#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \
template <typename DeviceContext, typename T, typename Tout, int64_t Rank> \
struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank, \
enum_argminmax_value> { \
void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
framework::LoDTensor* out, int64_t axis) { \
auto in_eigen = framework::EigenTensor<T, Rank>::From(in); \
auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out); \
out_eigen.device(*(ctx.eigen_device())) = \
in_eigen.eigen_op_type(axis).template cast<Tout>(); \
} \
}
DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
template <typename DeviceContext, typename T, typename Tout,
ArgMinMaxType EnumArgMinMaxValue>
class ArgMinMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& x = *(ctx.Input<framework::LoDTensor>("X"));
auto& out = *(ctx.Output<framework::LoDTensor>("Out"));
out.mutable_data<Tout>(ctx.GetPlace());
auto axis = ctx.Attr<int64_t>("axis");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
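// Dispatch on the runtime rank of X: the Eigen-based functor is instantiated
// for ranks 1 through 6, and higher ranks are rejected below.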
#define CALL_ARG_MINMAX_FUNCTOR(rank) \
ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
functor##rank; \
functor##rank(dev_ctx, x, &out, axis)
switch (x.dims().size()) {
case 1:
CALL_ARG_MINMAX_FUNCTOR(1);
break;
case 2:
CALL_ARG_MINMAX_FUNCTOR(2);
break;
case 3:
CALL_ARG_MINMAX_FUNCTOR(3);
break;
case 4:
CALL_ARG_MINMAX_FUNCTOR(4);
break;
case 5:
CALL_ARG_MINMAX_FUNCTOR(5);
break;
case 6:
CALL_ARG_MINMAX_FUNCTOR(6);
break;
default:
PADDLE_THROW(
"%s operator doesn't supports tensors whose ranks are greater "
"than 6.",
(EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
break;
#undef CALL_ARG_MINMAX_FUNCTOR
}
}
};
template <typename DeviceContext, typename T>
using ArgMinKernel =
ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMin>;
template <typename DeviceContext, typename T>
using ArgMaxKernel =
ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMax>;
class ArgMinMaxOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
const auto& x_dims = ctx->GetInputDim("X");
int64_t axis = ctx->Attrs().Get<int64_t>("axis");
PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(),
"'axis' must be inside [-Rank(X), Rank(X))");
auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank;
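    // The output shape is X's shape with the reduced axis removed.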
std::vector<int64_t> vec;
for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
ctx->SetOutputDim("Out", framework::make_ddim(vec));
}
};
class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
protected:
virtual const char* OpName() const = 0;
virtual const char* Name() const = 0;
public:
void Make() override {
AddInput("X", "Input tensor.");
AddOutput("Out", "Output tensor.");
AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
AddComment(string::Sprintf(R"DOC(
%s Operator.
Computes the indices of the %s elements of the input tensor
along the provided axis.
)DOC",
OpName(), Name()));
}
};
class ArgMinOpMaker : public BaseArgMinMaxOpMaker {
protected:
const char* OpName() const override { return "ArgMin"; }
const char* Name() const override { return "min"; }
};
class ArgMaxOpMaker : public BaseArgMinMaxOpMaker {
protected:
const char* OpName() const override { return "ArgMax"; }
const char* Name() const override { return "max"; }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp,
paddle::operators::ArgMinOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
arg_min,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
int64_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
int32_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OP_CUDA_KERNEL(
arg_min,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
size_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
......@@ -151,13 +151,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Variance",
"The global variance (for training) "
"or estimated Variance (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("Y", "result after normalization").Reuse("X");
AddOutput("MeanOut",
"Share memory with Mean. "
"Store the global mean when training");
"Store the global mean when training")
.Reuse("Mean");
AddOutput("VarianceOut",
"Share memory with Variance. "
"Store the global Variance when training");
"Store the global Variance when training")
.Reuse("Variance");
AddOutput("SavedMean",
"Mean of the current mini batch, "
"will apply to output when training")
......
......@@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("Input",
"(Tensor) Tensor "
"whose input_dim_idx'th dimension specifies the batch_size");
AddInput(
"Input",
"Tensor whose input_dim_idx'th dimension specifies the batch_size");
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"Tensor of specified shape will be filled "
"with the specified value");
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
AddAttr<std::vector<int>>("shape", "The shape of the output");
AddAttr<int>("input_dim_idx",
"(int, default 0) The index of input's batch size dimension")
"default 0. The index of input's batch size dimension")
.SetDefault(0);
AddAttr<int>("output_dim_idx",
"(int, default 0) The index of output's batch size dimension")
"default 0. The index of output's batch size dimension")
.SetDefault(0);
Apply();
}
......
......@@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input tensor of bilinear interpolation, "
"The input tensor of bilinear interpolation, "
"This is a 4-D tensor with shape of (N x C x h x w)");
AddInput("OutSize",
"(Tensor) This is a 1-D tensor with two number. "
"This is a 1-D tensor with two number. "
"The first number is height and the second number is width.")
.AsDispensable();
AddOutput("Out",
"(Tensor) The dimension of output is (N x C x out_h x out_w]");
AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
AddAttr<int>("out_h", "(int) output height of bilinear interpolation op.");
AddAttr<int>("out_w", "(int) output width of bilinear interpolation op.");
AddAttr<int>("out_h", "output height of bilinear interpolation op.");
AddAttr<int>("out_w", "output width of bilinear interpolation op.");
AddComment(R"DOC(
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
......
......@@ -125,7 +125,8 @@ void Conv2DOpMaker::Make() {
"input image channels divided by the groups.");
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
"The format of output tensor is also NCHW.")
.Reuse("Input");
AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of "
......@@ -220,7 +221,8 @@ void Conv3DOpMaker::Make() {
"input image channels divided by the groups.");
AddOutput("Output",
"(Tensor) The output tensor of convolution operator."
"The format of output tensor is also NCDHW.");
"The format of output tensor is also NCDHW.")
.Reuse("Input");
AddAttr<std::vector<int>>("strides",
"(vector<int>, default:{1, 1, 1}), the "
"strides(d_stride, h_stride, w_stride) of "
......
......@@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", y_dim);
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class CropOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
"The input used as reference for cropping, "
"which is of the same dimensions as X.")
.AsDispensable();
AddInput("Offsets",
"The input used to describe offsets in runtime, which is a "
"1-D vector whose size equals to the rank of input 'X'. The "
"elements data type must be int.")
.AsDispensable();
AddOutput("Out",
"The output of crop op, "
"which is of the same dimensions as X.");
AddAttr<std::vector<int>>("offsets",
"A list<int> describing offsets to be cropped. "
"The size of offsets list should be the same as "
"the dimension size of input X.");
"the dimension size of input X.")
.SetDefault(std::vector<int>());
AddAttr<std::vector<int>>("shape",
"A list<int> describing the shape of output. "
"The size of shape list should be the same as "
......@@ -77,6 +90,17 @@ Crop Operator.
Crop input into output, as specified by offsets and shape.
There are two ways to set the offsets:
   1. In runtime: Using the input 'Offsets', which is a Variable and can be
output of other operators. This way is suitable for
dynamic offsets.
2. In network configuration: Using the attribute 'offsets', which will be
set in Python configure script. This way is
suitable for fixed offsets.
You CANNOT use these two ways at the same time. An exception will be raised
if input 'Offsets' is configured and meanwhile the attribute 'offsets' is
not empty.
There are two ways to set shape:
1. reference input: crop input X into the same shape as reference input.
The dimension of reference input should
......@@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(x_grad_name, x_dims);
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
} // namespace operators
......
......@@ -27,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using framework::Tensor;
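// GetOffsets reads the crop offsets either from the runtime input 'Offsets'
// (copied to the CPU if it lives on the device) or, when that input is not
// given, from the 'offsets' attribute; configuring both at once is an error.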
static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
std::vector<int> res;
int rank = ctx.Input<Tensor>("X")->dims().size();
if (ctx.HasInput("Offsets")) {
PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
"Input 'Offsets' and attribute 'offsets' should not be used "
"at the same time.");
const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
PADDLE_ENFORCE_EQ(
rank, offsets_tensor->dims()[0],
"Offsets size should be equal to dimension size of input tensor.");
const int* offsets_data;
framework::Tensor cpu_tmp_tensor;
if (platform::is_cpu_place(offsets_tensor->place())) {
offsets_data = offsets_tensor->data<int>();
} else {
framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
&cpu_tmp_tensor);
offsets_data = cpu_tmp_tensor.data<int>();
}
res = std::vector<int>(offsets_data, offsets_data + rank);
} else {
res = ctx.Attr<std::vector<int>>("offsets");
PADDLE_ENFORCE_EQ(
rank, res.size(),
"Offsets size should be equal to dimension size of input tensor.");
}
return res;
}
template <typename T>
class CropKernel : public framework::OpKernel<T> {
public:
......@@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(context.GetPlace());
auto x_stride = framework::stride(x->dims());
auto out_stride = framework::stride(out->dims());
auto offsets = context.Attr<std::vector<int>>("offsets");
PADDLE_ENFORCE_EQ(
x->dims().size(), static_cast<int64_t>(offsets.size()),
"Offsets size should be equal to dimension size of input tensor.");
auto offsets = GetOffsets(context);
int64_t offset = 0;
for (size_t i = 0; i < offsets.size(); ++i) {
offset += (x_stride[i] * offsets[i]);
......@@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
if (d_x != nullptr) {
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(context.GetPlace());
auto offsets = context.Attr<std::vector<int>>("offsets");
auto offsets = GetOffsets(context);
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < D; ++i) {
paddings[i].first = offsets[i];
......
......@@ -124,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
"Tensor<float/double> with shape [N x D].");
AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss.");
"[N x 1]. The cross entropy loss.")
.Reuse("X");
AddAttr<bool>("soft_label",
"(bool, default false), a flag indicating whether to "
"interpretate the given labels as soft labels.")
......
if(WITH_DISTRIBUTE)
if(NOT WITH_DISTRIBUTE)
return()
endif()
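# When WITH_GRPC is ON, build the gRPC transport and its tests and return;
# otherwise build the brpc transport defined below.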
if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
selected_rows memory)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc SERIAL)
cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc
cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
proto_desc lookup_table_op SERIAL)
return()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc
brpc protobuf leveldb gflags glog
protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h"
namespace paddle {
namespace operators {
namespace detail {
DEFINE_int32(brpc_channel_num, 24,
"Number of channels to send requests connected to one server");
DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries (not including the first RPC)");
BRPCClient::~BRPCClient() { Wait(); }
void HandleSendResponse(brpc::Controller* cntl,
sendrecv::VoidMessage* response) {
// std::unique_ptr makes sure cntl/response will be deleted before returning.
std::unique_ptr<brpc::Controller> cntl_guard(cntl);
std::unique_ptr<sendrecv::VoidMessage> response_guard(response);
if (cntl->Failed()) {
LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
return;
}
LOG(INFO) << "Received response from " << cntl->remote_side()
<< " latency=" << cntl->latency_us() << "us";
}
bool BRPCClient::AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name, int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val);
framework::AsyncIO(
[var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] {
auto ch_ctx = ch_ptr->Pop();
brpc::Controller* cntl = new brpc::Controller();
sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
cntl->set_timeout_ms(time_out);
google::protobuf::Closure* done =
brpc::NewCallback(&HandleSendResponse, cntl, response);
sendrecv::VariableMessage request;
ch_ctx->stub->SendVariable(cntl, &request, response, done);
});
req_count_++;
return true;
}
void HandleGetResponse(brpc::Controller* cntl,
sendrecv::VariableMessage* response) {
// std::unique_ptr makes sure cntl/response will be deleted before returning.
std::unique_ptr<brpc::Controller> cntl_guard(cntl);
std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
if (cntl->Failed()) {
LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
return;
}
LOG(INFO) << "Received response from " << cntl->remote_side()
<< " latency=" << cntl->latency_us() << "us";
// framework::Variable* outvar = nullptr;
// DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
}
bool BRPCClient::AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name, int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::AsyncIO(
[var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {});
req_count_++;
return true;
}
bool BRPCClient::AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string in_var_name_val = in_var_name;
const std::string out_var_name_val = out_var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
time_out, ch, this] {});
req_count_++;
return true;
}
void BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out) {
req_count_++;
}
void BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) {
req_count_++;
}
void BRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_);
sync_cond_.wait(lk, [this] { return req_count_ == 0; });
}
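// GetChannel returns the pooled brpc channels for an endpoint, creating
// FLAGS_brpc_channel_num channels on first use and caching the queue under
// chan_mutex_ for subsequent calls.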
ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
{
std::lock_guard<std::mutex> guard(chan_mutex_);
auto it = channels_.find(ep);
if (it != channels_.end()) {
return it->second;
}
}
ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.connection_type = "pooled";
options.connect_timeout_ms = 100;
options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
options.max_retry = FLAGS_max_retry;
for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
std::shared_ptr<ChannelContext> c(new ChannelContext());
if (c->channel.Init(ep.c_str(), &options) != 0) {
LOG(ERROR) << "Fail to initialize channel";
return nullptr;
}
c->stub.reset(new sendrecv::SendRecvService_Stub(
static_cast<google::protobuf::RpcChannel*>(&c->channel)));
q->Push(c);
}
{
std::lock_guard<std::mutex> guard(chan_mutex_);
channels_[ep] = q;
}
return q;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <time.h>
#include <chrono> // NOLINT
#include <ctime>
#include <functional>
#include <iostream>
#include <map>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "brpc/channel.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace operators {
namespace detail {
struct ChannelContext {
brpc::Channel channel;
std::shared_ptr<sendrecv::SendRecvService_Stub> stub;
};
typedef std::shared_ptr<ChannelContext> ChannelContextPtr;
typedef std::shared_ptr<framework::BlockingQueue<ChannelContextPtr>>
ChannelQueuePtr;
class BRPCClient : public RPCClient {
public:
BRPCClient() {}
virtual ~BRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = RPCClient::rpc_time_out) override;
bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = RPCClient::rpc_time_out) override;
bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out = RPCClient::rpc_time_out) override;
void AsyncSendBatchBarrier(
const std::string& ep,
int64_t time_out = RPCClient::rpc_time_out) override;
void AsyncSendFetchBarrier(
const std::string& ep,
int64_t time_out = RPCClient::rpc_time_out) override;
void Wait() override;
private:
void Proceed();
ChannelQueuePtr GetChannel(const std::string& ep);
private:
std::unordered_map<std::string, ChannelQueuePtr> channels_;
// mutex for Wait client sync
std::mutex sync_mutex_;
std::condition_variable sync_cond_;
std::atomic<int64_t> req_count_{0};
// mutex for GetChannel thread safety
std::mutex chan_mutex_;
DISABLE_COPY_AND_ASSIGN(BRPCClient);
};
} // namespace detail
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/brpc_server.h"
#include "paddle/fluid/operators/detail/request_handler.h"
namespace sendrecv {
typedef std::unordered_map<std::string,
paddle::operators::detail::RequestHandler*>
HandlerMap;
class BRPCServiceImpl : public SendRecvService {
public:
explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
: request_send_h_(nullptr),
request_get_h_(nullptr),
request_prefetch_h_(nullptr) {
auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
if (it != rpc_call_map.end()) {
request_send_h_ = it->second;
}
    it = rpc_call_map.find(paddle::operators::detail::kRequestGet);
if (it != rpc_call_map.end()) {
request_get_h_ = it->second;
}
it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch);
if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second;
}
}
virtual ~BRPCServiceImpl() {}
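  // In async mode each SendVariable request runs in a fresh child scope that
  // is deleted afterwards; in sync mode the variable is looked up in the
  // shared server-side scope.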
void SendVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request, VoidMessage* response,
google::protobuf::Closure* done) override {
PADDLE_ENFORCE(request_send_h_ != nullptr,
"RequestSend handler should be registed first!");
brpc::ClosureGuard done_guard(done);
paddle::framework::Scope* local_scope = request_send_h_->scope();
paddle::framework::Variable* outvar = nullptr;
paddle::framework::Variable* invar = nullptr;
std::string varname = request->varname();
if (!request_send_h_->sync_mode()) {
local_scope = &request_send_h_->scope()->NewScope();
invar = local_scope->Var(varname);
} else {
invar = local_scope->FindVar(varname);
}
request_send_h_->Handle(varname, local_scope, invar, &outvar);
if (!request_send_h_->sync_mode()) {
request_send_h_->scope()->DeleteScope(local_scope);
}
}
void GetVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request, VariableMessage* response,
google::protobuf::Closure* done) override {
PADDLE_ENFORCE(request_get_h_ != nullptr,
"RequestGet handler should be registed first!");
}
void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request,
VariableMessage* response,
google::protobuf::Closure* done) override {
PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
"kRequestPrefetch handler should be registed first!");
}
private:
paddle::operators::detail::RequestHandler* request_send_h_;
paddle::operators::detail::RequestHandler* request_get_h_;
paddle::operators::detail::RequestHandler* request_prefetch_h_;
};
} // namespace sendrecv
namespace paddle {
namespace operators {
namespace detail {
void AsyncBRPCServer::StartServer() {
// Instance of your service.
sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
  // Add the service to the server. Note the second parameter: the service is
  // allocated on the stack, so the server must not delete it; otherwise use
  // brpc::SERVER_OWNS_SERVICE.
if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
LOG(FATAL) << "Fail to add service";
return;
}
brpc::ServerOptions options;
options.idle_timeout_sec = idle_timeout_s_;
options.max_concurrency = max_concurrency_;
if (server_.Start(bind_address_.c_str(), &options) != 0) {
LOG(FATAL) << "Fail to start EchoServer" << bind_address_;
return;
}
butil::EndPoint ep = server_.listen_address();
selected_port_ = ep.port;
{
std::lock_guard<std::mutex> lock(this->mutex_ready_);
ready_ = 1;
}
condition_ready_.notify_all();
server_.Join();
}
void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer is wait server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
VLOG(3) << "AsyncGRPCServer WaitSeverReady";
}
}; // namespace detail
}; // namespace operators
}; // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable> // NOLINT
#include <mutex> // NOLINT
#include <string>
#include "brpc/server.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace detail {
class AsyncBRPCServer final : public RPCServer {
public:
explicit AsyncBRPCServer(const std::string& address, int client_num)
: RPCServer(address, client_num), ready_(0) {}
virtual ~AsyncBRPCServer() {}
void StartServer() override;
void WaitServerReady() override;
private:
void ShutDownImpl() override;
brpc::Server server_;
static constexpr int idle_timeout_s_ = -1;
static constexpr int max_concurrency_ = 0;
std::mutex mutex_ready_;
std::condition_variable condition_ready_;
int ready_;
};
}; // namespace detail
}; // namespace operators
}; // namespace paddle
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......
......@@ -41,11 +41,22 @@ class RequestBase {
virtual ~RequestBase() {}
virtual void Process() = 0;
CallStatus Status() { return status_; }
void SetStatus(CallStatus status) { status_ = status; }
CallStatus Status() const {
std::lock_guard<std::mutex> l(status_mu_);
return status_;
}
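  // Finish updates status_ and completes the gRPC responder under status_mu_,
  // so concurrent Status() readers never observe a half-finished request.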
template <typename T>
void Finish(const T& reply, ServerAsyncResponseWriter<T>* responder) {
std::lock_guard<std::mutex> l(status_mu_);
status_ = FINISH;
responder->Finish(reply, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
}
virtual std::string GetReqName() = 0;
protected:
mutable std::mutex status_mu_;
::grpc::ServerContext ctx_;
GrpcService::AsyncService* service_;
::grpc::ServerCompletionQueue* cq_;
......@@ -80,9 +91,7 @@ class RequestSend final : public RequestBase {
framework::Variable* outvar = nullptr;
request_handler_->Handle(varname, scope, invar, &outvar);
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
Finish(reply_, &responder_);
}
protected:
......@@ -122,9 +131,7 @@ class RequestGet final : public RequestBase {
SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
&reply_);
}
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
Finish(reply_, &responder_);
}
protected:
......@@ -166,9 +173,7 @@ class RequestPrefetch final : public RequestBase {
SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
&reply_);
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
status_ = FINISH;
Finish(reply_, &responder_);
}
protected:
......
......@@ -53,6 +53,7 @@ class AsyncGRPCServer final : public RPCServer {
void StartServer() override;
private:
// HandleRequest needs to be thread-safe.
void HandleRequest(
::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
std::function<void(const std::string&, int)> TryToRegisterNewOne);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#define RPCSERVER_T detail::AsyncGRPCServer
#define RPCCLIENT_T detail::GRPCClient
#else
#include "paddle/fluid/operators/detail/brpc_client.h"
#include "paddle/fluid/operators/detail/brpc_server.h"
#define RPCSERVER_T detail::AsyncBRPCServer
#define RPCCLIENT_T detail::BRPCClient
#endif
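// Code that needs an RPC transport includes this header and uses RPCSERVER_T
// and RPCCLIENT_T, so the same sources build against either the gRPC or the
// brpc backend depending on PADDLE_WITH_GRPC.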
......@@ -28,7 +28,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
namespace paddle {
namespace operators {
......@@ -38,6 +37,10 @@ constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet";
constexpr char kRequestPrefetch[] = "RequestPrefetch";
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
class RPCServer;
class RequestHandler {
......@@ -80,7 +83,6 @@ class RequestHandler {
}
framework::ProgramDesc* program() { return program_; }
framework::Executor* executor() { return executor_; }
std::vector<framework::Variable*>& sparse_vars() { return sparse_vars_; }
// This function processes a user's RPC request.
// The implementation is in request_handler_impl.
......@@ -113,13 +115,7 @@ class RequestHandler {
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>*
grad_to_prepared_ctx_;
// Record received sparse variables, so that
// we can reset them after executing the optimize program
std::vector<framework::Variable*> sparse_vars_;
RPCServer* rpc_server_;
std::mutex sparse_var_mutex_;
};
} // namespace detail
......
......@@ -16,15 +16,12 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/variable_response.h"
namespace paddle {
namespace operators {
......@@ -63,16 +60,22 @@ bool RequestSendHandler::Handle(const std::string& varname,
PADDLE_THROW("sync: Can not find server side var");
return false;
}
if (invar->IsType<framework::SelectedRows>()) {
std::unique_lock<std::mutex> lock(sparse_var_mutex_);
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
sparse_vars_.push_back(invar);
}
}
return true;
}
void RequestSendHandler::ResetSparseVarRecorder() {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
for (auto* var : sparse_vars_) {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
sparse_vars_.clear();
}
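// Illustration (hypothetical values): suppose rows {0, 2, 5} of a
// SelectedRows gradient were received during the current mini-batch, i.e.
//   var->GetMutable<framework::SelectedRows>()->rows() == {0, 2, 5}.
// ResetSparseVarRecorder() clears that row list for every recorded variable,
// so the sum operator in the next mini-batch does not aggregate stale rows.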
bool RequestGetHandler::Handle(const std::string& varname,
framework::Scope* scope,
framework::Variable* invar,
......
......@@ -29,7 +29,6 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
namespace paddle {
namespace operators {
......@@ -41,6 +40,11 @@ class RequestSendHandler final : public RequestHandler {
virtual ~RequestSendHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar) override;
void ResetSparseVarRecorder();
private:
std::mutex mutex_sparse_vars_;
std::vector<framework::Variable*> sparse_vars_;
};
class RequestGetHandler final : public RequestHandler {
......
......@@ -26,6 +26,8 @@ namespace detail {
class RPCClient {
public:
RPCClient() {}
virtual ~RPCClient() {}
virtual bool AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
......
......@@ -60,6 +60,7 @@ class RPCServer {
void SetCond(const std::string& rpc_name);
void WaitCond(const std::string& rpc_name);
void IncreaseBatchBarrier(const std::string rpc_name);
void ResetBarrierCounter();
protected:
......
......@@ -17,15 +17,14 @@ limitations under the License. */
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
......@@ -33,7 +32,7 @@ namespace detail = paddle::operators::detail;
USE_OP(lookup_table);
std::unique_ptr<detail::AsyncGRPCServer> g_rpc_service;
std::unique_ptr<detail::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
......@@ -112,20 +111,19 @@ void StartServer() {
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get()));
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join();
}
TEST(PREFETCH, CPU) {
g_req_handler.reset(new detail::RequestPrefetchHandler(true));
g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer);
g_rpc_service->WaitServerReady();
detail::RPCClient* client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
......
......@@ -14,6 +14,8 @@ limitations under the License. */
syntax = "proto3";
package sendrecv;
// option cc_generic_services = true;
service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor
......
......@@ -32,16 +32,6 @@ namespace paddle {
namespace operators {
namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
static int64_t GetTimestamp() {
struct timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec * 1000 + tp.tv_usec / 1000;
}
typedef void (*DestroyCallback)(void*);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
......
......@@ -59,7 +59,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() final {
AddInput("X", "(Tensor), The first input tensor of elementwise op.");
AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
AddOutput("Out", "The output of elementwise op.");
AddOutput("Out", "The output of elementwise op.").Reuse("X");
AddAttr<int>("axis",
"(int, default -1). The start dimension index "
"for broadcasting Y onto X.")
......
......@@ -19,9 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -45,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase {
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient::GetInstance<RPCCLIENT_T>();
rpc_client->Wait();
......
......@@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
protected:
void Apply() override {
AddAttr<int>("dtype",
"(int, default 5 (FP32)) "
"Output data type")
AddAttr<int>(
"dtype",
"It could be numpy.dtype. Output data type. Default is float32")
.SetDefault(framework::proto::VarType::FP32);
AddAttr<float>("value", "(float, default 0) The value to be filled")
AddAttr<float>("value", "default 0. The value to be filled")
.SetDefault(0.0f);
AddComment(R"DOC(
FillConstantBatchSizeLike Operator.
Fill up a variable with a specified constant value.
This function creates a tensor of the specified *shape*, *dtype* and batch size,
and initializes it with a constant supplied in *value*. The batch size is
obtained from the `input` tensor.
)DOC");
}
......
......@@ -43,7 +43,8 @@ TEST(Gather, GatherData) {
auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place);
paddle::operators::CPUGather<int>(ctx, *src, *index, output);
delete cpu_place;
cpu_place = NULL;
for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
......
......@@ -21,8 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h"
......@@ -61,8 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("endpoint_list");
detail::RPCClient* client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
for (auto& ep : endpoint_list) {
VLOG(3) << "sending nccl id to " << ep;
client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
......@@ -78,9 +77,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
// deleter will call GRPC Server's base class's dtor and
// that will cause a weird crash.
detail::RequestSendHandler rpc_h(true);
detail::AsyncGRPCServer rpc_service(endpoint, 1);
rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h);
rpc_h.SetRPCServer(&rpc_service);
std::unique_ptr<detail::RPCServer> rpc_service(
new RPCSERVER_T(endpoint, 1));
rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h);
rpc_h.SetRPCServer(rpc_service.get());
framework::ProgramDesc empty_program;
framework::Executor executor(dev_ctx.GetPlace());
......@@ -90,12 +91,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
rpc_h.SetExecutor(&executor);
std::thread server_thread(
std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service));
rpc_service.SetCond(detail::kRequestSend);
std::bind(&detail::RPCServer::StartServer, rpc_service.get()));
rpc_service->SetCond(detail::kRequestSend);
VLOG(3) << "start getting nccl id from trainer 0...";
rpc_service.WaitBarrier(detail::kRequestSend);
rpc_service->WaitBarrier(detail::kRequestSend);
VLOG(3) << "got nccl id and stop server...";
rpc_service.ShutDown();
rpc_service->ShutDown();
VLOG(3) << "rpc server stopped";
server_thread.join();
}
......
......@@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
"mini-batch. Note: S is equal to the sequence number in a mini-batch. "
"The output is no longer a LoDTensor.");
AddComment(R"DOC(
LinearChainCRF Operator.
Conditional Random Field defines an undirected probabilistic graph with nodes
denoting random variables and edges denoting dependencies between these
variables. CRF learns the conditional probability $P(Y|X)$, where
......
......@@ -19,7 +19,8 @@ limitations under the License. */
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -89,6 +90,12 @@ void ListenAndServOp::SavePort() const {
rpc_service_->SavePort();
}
static int64_t GetTimestamp() {
struct timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec * 1000 + tp.tv_usec / 1000;
}
void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *recv_scope,
......@@ -108,9 +115,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
rpc_service_->ResetBarrierCounter();
// Record received sparse variables, so that
// we can reset them after executing the optimize program
std::vector<framework::Variable *> sparse_vars;
while (true) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
......@@ -130,7 +134,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
int32_t last_parent_blkid = program->Block(1).Parent();
std::vector<size_t> parallel_blkids;
parallel_blkids.push_back(1);
double ts = detail::GetTimestamp();
double ts = GetTimestamp();
for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
if (blkid != static_cast<size_t>(prefetch_block->ID())) {
if (program->Block(blkid).Parent() != last_parent_blkid) {
......@@ -144,20 +148,14 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
}
ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
recv_scope);
VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
// Reset the received sparse variables so that the sum operator will not
// sum input sparse variables whose rows are empty in the next
// mini-batch.
// TODO(Yancey1989): move the reset action into an operator; we shouldn't
// have any hidden logic in the operator.
for (framework::Variable *var : sparse_vars) {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
rpc_service_->SetCond(detail::kRequestGet);
rpc_service_->WaitBarrier(detail::kRequestGet);
rpc_service_->ResetBarrierCounter();
// reset received sparse vars to avoid reusing them in the next mini-batch
dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder();
} // while(true)
}
......@@ -244,8 +242,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
<< ", end_point:" << endpoint;
// request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode));
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in));
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
request_send_handler_.reset(new detail::RequestSendHandler(sync_mode));
request_get_handler_.reset(new detail::RequestGetHandler(sync_mode));
request_prefetch_handler_.reset(
......
......@@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase {
class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddOutput("Out", "(Tensor) The tensor need to be loaded");
AddOutput("Out", "The tensor need to be loaded");
AddAttr<bool>(
"load_as_fp16",
"(boolean, default false)"
"If true, the tensor will be first loaded and then "
"converted to float16 data type. Otherwise, the tensor will be "
"directly loaded without data type conversion.")
"directly loaded without data type conversion. Default is false.")
.SetDefault(false);
AddAttr<std::string>("file_path",
"(string) "
"Variable will be loaded from \"file_path\".")
R"(Variable will be loaded from "file_path")")
.AddCustomChecker(
[](const std::string &path) { return !path.empty(); });
AddComment(R"DOC(
Load Operator.
Load operator will load a tensor variable from disk file.
)DOC");
AddComment("Load operator will load a tensor variable from disk file.");
}
};
} // namespace operators
......
......@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) {
paddle::platform::CPUDeviceContext context(*cpu_place);
GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
delete cpu_place;
cpu_place = NULL;
EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24);
......
......@@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("RankTable", "The lod_rank_table.");
AddOutput("Out", "The max sequence length.");
AddComment(
R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
AddInput("RankTable", "Input variable which is a LoDRankTable object");
AddOutput("Out", "The max sequence length");
AddComment(R"DOC(
Given a LoDRankTable object, this layer returns the max length of
a batch of sequences. In fact, a LoDRankTable object contains a list of
tuples (<sequence index, sequence length>) and the list is already sorted by
sequence length in descending order, so the operator just returns the
sequence length of the first tuple element.
)DOC");
}
};
......
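A small worked example of the behaviour described above, with hypothetical table contents:
  RankTable (already sorted by sequence length, descending):
    (seq index, seq length) = (2, 5), (0, 3), (1, 2)
  Out = 5, the length stored in the first entry.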
......@@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input of mean op");
AddOutput("Out", "The output of mean op");
AddOutput("Out", "The output of mean op").Reuse("X");
AddComment(R"DOC(
Mean Operator.
......
......@@ -16,40 +16,34 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename AttrType>
class NormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddInput("Scale",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is C * 1.");
AddAttr<AttrType>("epsilon",
"(float, default 1e-10) Constant "
"for numerical stability.")
AddInput("X", "(Tensor) A tensor of rank >= axis.");
AddAttr<int>("axis",
"The axis on which to apply normalization. If axis < 0, "
"the dimension to normalization is rank(X) + axis. -1 is "
"the last dimension.");
AddAttr<float>("epsilon",
"(float, default 1e-10) The epsilon value is used "
"to avoid division by zero.")
.SetDefault(1.0e-10f);
AddOutput("Out",
"(Tensor) The output tensor of norm operator."
"N * M."
"M = C * H * W");
AddOutput("Norm",
"(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will "
"be used in backward kernel.")
.AsIntermediate();
AddOutput("Out", "(Tensor) A tensor of the same shape as X.");
AddComment(R"DOC(
"Input shape: $(N, C, H, W)$
Scale shape: $(C, 1)$
Output shape: $(N, C, H, W)$
Where
forward
$$
[\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
$$
backward
$$
\frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
$$
)DOC");
Given a tensor, apply L2-normalization along the provided axis.
$$
y = \frac{x}{ \sqrt{\sum {x^2} + \epsilon} }
$$
where $\sum {x^2}$ is calculated along the `axis` dimension.
)DOC");
}
};
......@@ -58,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of NormOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of NormOp"
"should not be null.");
"Input(X) of NormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of NormOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", in_x_dims);
auto xdim = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", xdim);
int axis = ctx->Attrs().Get<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
xdim[axis] = 1;
ctx->SetOutputDim("Norm", xdim);
}
};
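// Shape example for the inference above (hypothetical dims): if X has dims
// [2, 3, 4] and axis = -1, then axis becomes xdim.size() + axis = 2, Out
// keeps dims [2, 3, 4], and Norm gets dims [2, 3, 1].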
......@@ -84,12 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
REGISTER_OP_CPU_KERNEL(
norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
ops::NormKernel<CPU, double>);
REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
ops::NormGradKernel<CPU, double>);
......@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
REGISTER_OP_CUDA_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel<CUDA, float>,
ops::NormKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel<CUDA, float>,
ops::NormGradKernel<CUDA, double>);
......@@ -19,156 +19,110 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T, typename AttrType = T>
inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
int* post) {
*pre = 1;
*post = 1;
*n = dim[axis];
for (int i = 0; i < axis; ++i) {
(*pre) *= dim[i];
}
for (int i = axis + 1; i < dim.size(); ++i) {
(*post) *= dim[i];
}
}
template <typename DeviceContext, typename T>
class NormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Out");
auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
out->mutable_data<T>(context.GetPlace());
int batch_size = in_x->dims()[0];
int channels = in_x->dims()[1];
int height = in_x->dims()[2];
int width = in_x->dims()[3];
int fea_len = height * width;
auto* place =
context.template device_context<DeviceContext>().eigen_device();
auto x =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in_x, framework::make_ddim({batch_size, fea_len * channels}));
// get square
framework::Tensor x_square;
x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
auto x_square_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square, framework::make_ddim({batch_size, fea_len * channels}));
x_square_eigen.device(*place) = x.square();
auto scale_eigen =
framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
*scale);
for (int n = 0; n < batch_size; ++n) {
framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
auto in_x_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_x_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
auto x_square_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor out_batch = out->Slice(n, n + 1);
auto out_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
out_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor tmp_tensor;
tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto tmp = framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(tmp_tensor);
// get colsum and sqrt , inverse
auto dim = Eigen::array<int, 1>({{0}});
tmp.device(*place) = x_square_batch_eigen.sum(dim);
tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
Eigen::array<int, 2> broadcast_dim_col;
broadcast_dim_col[1] = 1;
broadcast_dim_col[0] = channels;
out_batch_eigen.device(*place) =
in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
Eigen::array<int, 2> broadcast_dim_row;
broadcast_dim_row[1] = fea_len;
broadcast_dim_row[0] = 1;
out_batch_eigen.device(*place) =
out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_x = ctx.Input<framework::Tensor>("X");
auto* out_y = ctx.Output<framework::Tensor>("Out");
auto* out_norm = ctx.Output<framework::Tensor>("Norm");
out_y->mutable_data<T>(ctx.GetPlace());
out_norm->mutable_data<T>(ctx.GetPlace());
auto xdim = in_x->dims();
auto ndim = out_norm->dims();
T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
int pre, n, post;
GetDims(xdim, axis, &pre, &n, &post);
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 3> shape(pre, n, post);
Eigen::DSizes<int, 2> norm_shape(pre, post);
auto x_e = framework::EigenVector<T>::Flatten(*in_x);
auto y_e = framework::EigenVector<T>::Flatten(*out_y);
auto norm_e = framework::EigenVector<T>::Flatten(*out_norm);
auto x = x_e.reshape(shape);
auto y = y_e.reshape(shape);
auto norm = norm_e.reshape(norm_shape);
Eigen::DSizes<int, 1> rdim(1);
// y = x / sqrt((sum(x * x) + epsilon))
// norm = sqrt(sum(x * x) + epsilon)
auto sum = x.pow(2).sum(rdim) + eps;
norm.device(*place) = sum.sqrt();
// y = x / norm
Eigen::DSizes<int, 3> rshape(pre, 1, post);
Eigen::DSizes<int, 3> bcast(1, n, 1);
y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
}
};
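// Worked example of the forward pass above (hypothetical values): for a
// slice x = [3, 4] along `axis` and a negligible epsilon,
//   norm = sqrt(3^2 + 4^2) = 5 and y = [3/5, 4/5] = [0.6, 0.8];
// the same norm value is stored in the Norm output for the backward kernel.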
template <typename DeviceContext, typename T, typename AttrType = T>
class NormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
const framework::Tensor* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
framework::Tensor* in_x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
in_x_grad->mutable_data<T>(context.GetPlace());
int batch_size = in_x->dims()[0];
int channels = in_x->dims()[1];
int height = in_x->dims()[2];
int width = in_x->dims()[3];
int fea_len = height * width;
auto* place =
context.template device_context<DeviceContext>().eigen_device();
auto scale_eigen =
framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
*scale);
auto x =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in_x, framework::make_ddim({batch_size, fea_len * channels}));
// get square
framework::Tensor x_square;
x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
auto x_square_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square, framework::make_ddim({batch_size, fea_len * channels}));
x_square_eigen.device(*place) = x.square();
for (int n = 0; n < batch_size; ++n) {
framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
auto in_x_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_x_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
auto in_g_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_g_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
auto x_square_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
auto outg_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
outg_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor tmp_tensor;
tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto tmp_eigen =
framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(tmp_tensor);
auto dim = Eigen::array<int, 1>({{0}});
tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
framework::Tensor norm_tmp_tensor;
norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto norm_tmp_eigen =
framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
norm_tmp_eigen.device(*place) =
(x_square_batch_eigen.sum(dim) + epsilon).sqrt();
Eigen::array<int, 2> broadcast_dim_col;
broadcast_dim_col[1] = 1;
broadcast_dim_col[0] = channels;
in_g_batch_eigen.device(*place) =
in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
in_g_batch_eigen.device(*place) =
in_g_batch_eigen /
(norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
// outg_batch_eigen + (in_g_batch_eigen * -1);
in_g_batch_eigen.device(*place) =
in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
Eigen::array<int, 2> broadcast_dim_row;
broadcast_dim_row[1] = fea_len;
broadcast_dim_row[0] = 1;
in_g_batch_eigen.device(*place) =
in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_x = ctx.Input<framework::Tensor>("X");
auto* in_norm = ctx.Input<framework::Tensor>("Norm");
auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
out_dx->mutable_data<T>(ctx.GetPlace());
auto xdim = in_x->dims();
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
int pre, n, post;
GetDims(xdim, axis, &pre, &n, &post);
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
auto x_e = framework::EigenVector<T>::Flatten(*in_x);
auto dy_e = framework::EigenVector<T>::Flatten(*in_dy);
auto norm_e = framework::EigenVector<T>::Flatten(*in_norm);
auto dx_e = framework::EigenVector<T>::Flatten(*out_dx);
Eigen::DSizes<int, 3> shape(pre, n, post);
Eigen::DSizes<int, 2> norm_shape(pre, post);
auto x = x_e.reshape(shape);
auto dy = dy_e.reshape(shape);
auto norm = norm_e.reshape(norm_shape);
auto dx = dx_e.reshape(shape);
framework::Tensor rsum;
rsum.mutable_data<T>({pre, post}, ctx.GetPlace());
auto sum = framework::EigenTensor<T, 2>::From(rsum);
Eigen::DSizes<int, 1> rdim(1);
Eigen::DSizes<int, 3> bcast(1, n, 1);
Eigen::DSizes<int, 3> rshape(pre, 1, post);
// dx_i = dy_i / norm - x_i * sum_j(x_j * dy_j) / norm^3
//      = [dy_i - x_i * sum(x*dy) / (sum(x*x) + e)] / norm,
// where norm = sqrt(sum(x*x) + e) was computed in the forward pass.
// 1. sum = sum(x*dy)
sum.device(*place) = (x * dy).sum(rdim);
// 2. dx = x * sum
dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x;
// 3. dx / (sum(x*x) + e)
// where, norm.pow(2) = sum(x*x) + e, which is calculated in forward.
dx.device(*place) = dx / norm.pow(2).broadcast(bcast);
// 4. [dy - dx] / sqrt(sum(x*x))
dx.device(*place) = (dy - dx) / norm.broadcast(bcast);
}
};
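// Reference derivation for the gradient computed above: with
// norm = sqrt(sum_j x_j^2 + e) and y_i = x_i / norm,
//   d y_j / d x_i = delta_ij / norm - x_i * x_j / norm^3,
// so
//   dL/dx_i = sum_j dy_j * (d y_j / d x_i)
//           = [dy_i - x_i * sum_j(x_j * dy_j) / norm^2] / norm,
// which is exactly steps 1-4 in Compute(): sum = sum(x*dy),
// dx = x * sum / norm^2, dx = (dy - dx) / norm.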
} // namespace operators
......
......@@ -18,9 +18,14 @@ limitations under the License. */
namespace paddle {
namespace operators {
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::pooling_forward;
using framework::DataLayout;
using mkldnn::memory;
using mkldnn::pooling_backward;
using mkldnn::pooling_forward;
using mkldnn::primitive;
using mkldnn::reorder;
using mkldnn::stream;
using platform::to_void_cast;
// Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make hashing function more optimal
......@@ -55,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
// Get a unique name from the "argument" name of the "Out" variable
// This name will be used as the key when saving info into the device context
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
......@@ -82,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
auto input_format = input->format();
memory::format output_format{memory::format::format_undef};
const std::string key = gethash(src_tz, pooling_type, ksize, strides,
paddings, ctx.op().Output("Out"));
const std::string key_pool_p = key + "@pool_p";
......@@ -94,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) {
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), input_format);
auto src_md =
platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType<T>(),
mkldnn::memory::format::nchw);
auto dst_md =
platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType<T>(),
mkldnn::memory::format::nchw);
/* create memory descriptor for pooling without specified format
* ('any') which lets a primitive (pooling in this case) choose
* the memory format preferred for best performance
*/
auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::any);
std::shared_ptr<pooling_forward::primitive_desc> pool_pd =
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
pooling_type, mkldnn_engine);
......@@ -116,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto pool_src_memory_p = std::make_shared<memory>(
memory::primitive_desc{src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p);
auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
to_void_cast<T>(input_data));
auto dst_memory =
std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);
auto pool_dst_memory_p = std::make_shared<memory>(
memory::primitive_desc{dst_md, mkldnn_engine},
static_cast<void*>(output_data));
dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p);
dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()),
*(dst_memory.get()),
*workspace_memory);
pool_p = std::make_shared<pooling_forward>(
*pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()),
*workspace_memory);
dev_ctx.SetBlob(key_pool_p, pool_p);
output_format =
(memory::format)dst_memory->get_primitive_desc().desc().data.format;
} else {
// Primitives already exist
auto pool_src_memory_p =
......@@ -140,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
"Fail to find pooling dst mem_p in device context");
pool_src_memory_p->set_data_handle(
reinterpret_cast<void*>(const_cast<T*>(input_data)));
pool_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
pool_dst_memory_p->set_data_handle(output_data);
output_format = (memory::format)pool_dst_memory_p->get_primitive_desc()
.desc()
.data.format;
}
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(output_format);
}
private:
......@@ -194,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN &&
in_x->format() != memory::format::format_undef,
"Wrong layout/format set for Input X tensor");
PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN &&
out_grad->format() != memory::format::format_undef,
"Wrong layout/format set for Input output_grad tensor");
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
......@@ -212,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const T* out_grad_data = out_grad->data<T>();
T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
memory::format in_x_grad_format{memory::format::format_undef};
std::vector<int> diff_src_tz =
paddle::framework::vectorize2int(in_x_grad->dims());
......@@ -225,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const std::string key_pool_bwd_p = key + "@pool_bwd_p";
const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()},
mkldnn_engine},
to_void_cast<T>(out_grad_data));
std::shared_ptr<memory> diff_src_memory;
std::shared_ptr<memory> diff_dst_memory;
auto dst_memory =
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
PADDLE_ENFORCE(dst_memory != nullptr,
"Fail to find dst_memory in device context");
primitive reorder_diff_dst;
bool is_diff_dst_reordered = false;
auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
dev_ctx.GetBlob(key_pool_bwd_p));
if (pool_bwd_p == nullptr) {
auto diff_src_md =
platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
mkldnn::memory::format::nchw);
auto diff_dst_md =
platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType<T>(),
mkldnn::memory::format::nchw);
// Retrieve src_memory/dst_memory saved in forward pass
auto src_memory =
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
PADDLE_ENFORCE(src_memory != nullptr,
"Fail to find src_memory in device context");
// Retrieve pool_pd/pool_workspace_memory from device context
auto pool_pd =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx.GetBlob(key_pool_pd));
PADDLE_ENFORCE(pool_pd != nullptr,
"Fail to find pool_pd in device context");
auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
auto workspace_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_workspace_memory));
PADDLE_ENFORCE(workspace_memory != nullptr,
"Fail to find workspace_memory in device context");
auto pool_diff_src_memory_p = std::make_shared<memory>(memory(
{diff_src_md, mkldnn_engine}, static_cast<void*>(in_x_grad_data)));
dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p);
auto pool_diff_dst_memory_p = std::make_shared<memory>(
memory({diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(out_grad_data))));
dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p);
// create memory descriptors for pooling
auto diff_src_md = src_memory.get()->get_primitive_desc().desc();
auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc();
auto pool_bwd_desc = mkldnn::pooling_backward::desc(
pooling_type == "max" ? mkldnn::algorithm::pooling_max
......@@ -267,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
pool_bwd_desc, mkldnn_engine, *pool_pd);
// reorder between user_diff_dst and pool diff_dst if needed
diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory =
std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
diff_src_memory = std::make_shared<memory>(
pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data);
dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory);
dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
pool_bwd_p = std::make_shared<pooling_backward>(
pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory,
*(pool_diff_src_memory_p));
pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
*(diff_src_memory));
dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
} else {
// Primitives already exist
auto pool_diff_src_memory_p = std::static_pointer_cast<memory>(
diff_src_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_diff_src_mem_p));
PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr,
PADDLE_ENFORCE(diff_src_memory != nullptr,
"Fail to find pooling src mem_p in device context");
auto pool_diff_dst_memory_p = std::static_pointer_cast<memory>(
diff_dst_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr,
PADDLE_ENFORCE(diff_dst_memory != nullptr,
"Fail to find pooling dst mem_p in device context");
pool_diff_src_memory_p->set_data_handle(
reinterpret_cast<void*>(in_x_grad_data));
pool_diff_dst_memory_p->set_data_handle(const_cast<T*>(out_grad_data));
diff_src_memory->set_data_handle(reinterpret_cast<void*>(in_x_grad_data));
diff_dst_memory->set_data_handle(const_cast<T*>(out_grad_data));
// reorder between user_diff_dst and pool diff_dst if needed
if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory =
std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
}
in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format;
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{*(pool_bwd_p.get())};
std::vector<mkldnn::primitive> pipeline;
if (is_diff_dst_reordered) {
pipeline.push_back(reorder_diff_dst);
}
pipeline.push_back(*(pool_bwd_p.get()));
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
in_x_grad->set_layout(DataLayout::kMKLDNN);
in_x_grad->set_format(in_x_grad_format);
} // Compute()
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNOpKernel<float>);
ops::PoolMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNGradOpKernel<float>);
ops::PoolMKLDNNGradOpKernel<float>);
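The kernels above rely on one idiom worth calling out: the forward primitive is created with memory::format::any so MKLDNN can pick its preferred layout, and user-provided memory is reordered into that layout only when the two primitive descriptors actually differ. A condensed, hypothetical sketch of that reorder-if-needed step, using the same MKLDNN 0.x calls as the code above (helper name and structure are illustrative, not part of this patch):
#include <memory>
#include <vector>
#include "mkldnn.hpp"
std::shared_ptr<mkldnn::memory> MaybeReorder(
    const mkldnn::memory& user_mem,
    const mkldnn::memory::primitive_desc& wanted_pd,
    std::vector<mkldnn::primitive>* pipeline) {
  // Layouts already match: reuse the user's memory as-is.
  if (user_mem.get_primitive_desc() == wanted_pd) {
    return std::make_shared<mkldnn::memory>(user_mem);
  }
  // Otherwise allocate memory in the preferred layout and queue a reorder,
  // which is later executed together with the pooling primitive.
  auto reordered = std::make_shared<mkldnn::memory>(wanted_pd);
  pipeline->push_back(mkldnn::reorder(user_mem, *reordered));
  return reordered;
}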
......@@ -151,7 +151,8 @@ void Pool2dOpMaker::Make() {
"The format of output tensor is also NCHW, "
"where N is batch size, C is the number of channels, "
"H is the height of the feature, "
"and W is the width of the feature.");
"and W is the width of the feature.")
.Reuse("X");
AddAttr<std::string>("pooling_type",
"(string), pooling type, can be \"max\" for max-pooling "
......@@ -244,7 +245,8 @@ void Pool3dOpMaker::Make() {
"The format of output tensor is also NCDHW, "
"where N is batch size, C is "
"the number of channels, and D, H and W is the depth, height and "
"width of the feature, respectively.");
"width of the feature, respectively.")
.Reuse("X");
AddAttr<std::string>("pooling_type",
"(string) Pooling type, can be \"max\" for max-pooling "
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/send_recv_util.h"
namespace paddle {
......@@ -42,7 +42,7 @@ class PrefetchOp : public framework::OperatorBase {
auto& ctx = *pool.Get(place);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
......
......@@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
......@@ -36,11 +35,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Seed", "The random seed.");
AddOutput("Out", "The cropped instance batch.");
AddOutput("SeedOut", "The random seed after random cropping.")
.AsDispensable();
.AsIntermediate();
AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
AddComment(R"DOC(
This operator takes a batch of instances and does random cropping on each instance.
The cropping position differs for each instance and is determined
This operator takes a batch of instances and does random cropping on each instance.
The cropping position differs for each instance and is determined
by a uniform random generator. All cropped instances have the same shape, which
is determined by the operator's attribute 'shape'.
)DOC");
......
......@@ -19,8 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -45,7 +44,7 @@ class RecvOp : public framework::OperatorBase {
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
......@@ -78,9 +77,15 @@ This operator can get variables from server side.
}
};
class RecvOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker,
ops::RecvOpMaker, ops::RecvOpShapeInference);
......@@ -19,8 +19,8 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -45,7 +45,7 @@ class SendBarrierOp : public framework::OperatorBase {
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
......
......@@ -16,10 +16,9 @@ limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
std::vector<std::string> endpoints =
Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_mode");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
......@@ -50,37 +46,19 @@ class SendOp : public framework::OperatorBase {
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
rpc_client->Wait();
if (sync_mode) {
for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
rpc_client->Wait();
}
if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) {
VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
}
rpc_client->Wait();
// tell pservers that current trainer have called fetch
for (auto& ep : endpoints) {
VLOG(2) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep);
}
if (sync_send) {
rpc_client->Wait();
}
}
......@@ -89,26 +67,22 @@ class SendOp : public framework::OperatorBase {
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
AddOutput("Out", "(Tensor) Output tensor to be received from server")
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddComment(R"DOC(
Send operator
This operator will send tensor to recv_op at the parameter server.
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
// TODO(typhoonzero): remove this attr generate de-duplicated vector from
// epmap when initializing.
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({});
AddAttr<int>("sync_mode",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({});
AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
.SetDefault({"127.0.0.1:6164"});
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
class SendVarsOp : public framework::OperatorBase {
public:
SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_send");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
if (sync_send) {
rpc_client->Wait();
}
}
};
class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
AddAttr<int>("sync_send",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendVarsOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
ops::SendVarsOpShapeInference);
......@@ -74,7 +74,8 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
AddOutput("ParamOut",
"(Tensor or SelectedRows, same with Param) "
"Output parameter, should share the same memory with Param");
"Output parameter, should share the same memory with Param")
.Reuse("Param");
AddComment(R"DOC(
SGD operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/slice_op.h"
#include <algorithm>
#include <vector>
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class SliceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input (Input) of slice op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output (Out) of slice op should not be null.");
auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE(in_dims.size() < 7,
"The rank of input should be less than 7.");
framework::DDim out_dims(in_dims);
auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
PADDLE_ENFORCE_EQ(starts.size(), ends.size());
PADDLE_ENFORCE_EQ(starts.size(), axes.size());
int dim_value, start, end;
for (size_t i = 0; i < axes.size(); ++i) {
dim_value = out_dims[axes[i]];
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0);
end = std::max(end, 0);
start = std::min(start, dim_value);
end = std::min(end, dim_value);
start = std::min(start, end);
out_dims[axes[i]] = end - start;
}
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
ctx.GetPlace());
}
};
class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "Tensor of data to extract slices from.");
AddOutput("Out", "Sliced data tensor.");
AddAttr<std::vector<int>>(
"axes",
"(list<int>) Axes that `starts` and `ends` apply to. It's optional."
"If not present, will be treated as [0, 1, ..., len(`starts`) - 1].");
AddAttr<std::vector<int>>(
"starts",
"(list<int>) Starting indices of corresponding axis in `axes`");
AddAttr<std::vector<int>>(
"ends",
"(list<int>) Starting indices of corresponding axis in `axes`.");
AddComment(R"DOC(
Slice Operator.
Produces a slice of the input tensor along multiple axes. Similar to numpy:
https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
Slice uses the `axes`, `starts` and `ends` attributes to specify the start and
end indices for each axis in the list of axes; it uses this information
to slice the input data tensor. If a negative value is passed for any of
the start or end indices, it represents the number of elements counted from
the end of that dimension. If the value passed to start or end is larger than
n (the number of elements in this dimension), it is clamped to n.
For slicing to the end of a dimension with unknown size, it is recommended
to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1].
Example 1:
Given:
data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
axes = [0, 1]
starts = [1, 0]
ends = [2, 3]
Then:
result = [ [5, 6, 7], ]
Example 2:
Given:
data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
starts = [0, 1]
ends = [-1, 1000]
Then:
result = [ [2, 3, 4], ]
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
ops::SliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CPUDeviceContext, float>,
ops::SliceKernel<paddle::platform::CPUDeviceContext, double>);
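The two examples in the slice DOC string above map directly onto numpy basic slicing. The sketch below is only a hedged, standalone numpy reproduction of the documented semantics, not a call into the operator itself:

```python
import numpy as np

data = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# Example 1: axes=[0, 1], starts=[1, 0], ends=[2, 3]
result1 = data[1:2, 0:3]        # -> [[5, 6, 7]]

# Example 2: axes omitted -> [0, 1], starts=[0, 1], ends=[-1, 1000];
# negative indices count from the end, and out-of-range ends are clamped to n.
result2 = data[0:-1, 1:1000]    # -> [[2, 3, 4]]
```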
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/slice_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SliceKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int rank = ctx.Input<framework::Tensor>("Input")->dims().size();
switch (rank) {
case 1:
SliceCompute<1>(ctx);
break;
case 2:
SliceCompute<2>(ctx);
break;
case 3:
SliceCompute<3>(ctx);
break;
case 4:
SliceCompute<4>(ctx);
break;
case 5:
SliceCompute<5>(ctx);
break;
case 6:
SliceCompute<6>(ctx);
break;
}
}
private:
template <size_t D>
void SliceCompute(const framework::ExecutionContext& context) const {
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto in = context.Input<framework::Tensor>("Input");
auto out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto out_dims = out->dims();
auto in_dims = in->dims();
auto axes = context.Attr<std::vector<int>>("axes");
auto starts = context.Attr<std::vector<int>>("starts");
auto offsets = Eigen::array<int, D>();
auto extents = Eigen::array<int, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = out_dims[i];
}
int start;
for (size_t i = 0; i < axes.size(); ++i) {
start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out);
out_t.device(place) = in_t.slice(offsets, extents);
}
};
} // namespace operators
} // namespace paddle
......@@ -83,7 +83,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X",
"The input tensor of softmax. "
"2-D with shape [batch_size, input_feature_dimensions].");
AddOutput("Out", "The normalized values with the same shape as X.");
AddOutput("Out", "The normalized values with the same shape as X.")
.Reuse("X");
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
......
......@@ -115,7 +115,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
.AsDuplicable();
AddOutput("Out", "(Tensor) The output tensor of sum operator.");
AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
AddComment(R"DOC(
Sum operator.
......
......@@ -20,8 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -29,6 +28,10 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/send_recv_util.h"
#endif
USE_NO_KERNEL_OP(listen_and_serv);
namespace f = paddle::framework;
......@@ -37,7 +40,7 @@ namespace m = paddle::operators::math;
namespace detail = paddle::operators::detail;
namespace string = paddle::string;
std::unique_ptr<detail::AsyncGRPCServer> g_rpc_service;
std::unique_ptr<detail::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler;
void StartServer() {
......@@ -58,7 +61,7 @@ void StartServer() {
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get()));
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
g_rpc_service->SetCond(detail::kRequestSend);
g_rpc_service->WaitBarrier(detail::kRequestSend);
......@@ -68,9 +71,9 @@ void StartServer() {
server_thread.join();
}
TEST(SendNcclId, GrpcServer) {
TEST(SendNcclId, RPCServer) {
g_req_handler.reset(new detail::RequestSendHandler(true));
g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
std::thread server_thread(StartServer);
g_rpc_service->WaitServerReady();
......@@ -87,8 +90,9 @@ TEST(SendNcclId, GrpcServer) {
int port = g_rpc_service->GetSelectedPort();
std::string ep = string::Sprintf("127.0.0.1:%d", port);
detail::RPCClient* client =
detail::RPCClient::GetInstance<detail::GRPCClient>();
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
LOG(INFO) << "connect to server" << ep;
client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
client->Wait();
......
......@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) The input of Topk op");
AddOutput("Out", "(Tensor) The output tensor of Topk op");
AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
AddComment(R"DOC(
Top K operator
......
......@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <vector>
......@@ -100,6 +101,7 @@ class CUDADeviceContext : public DeviceContext {
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) {
std::lock_guard<std::mutex> guard(mtx_);
callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
......@@ -116,6 +118,8 @@ class CUDADeviceContext : public DeviceContext {
int compute_capability;
int multi_process;
int max_threads_per_mp;
std::mutex mtx_;
};
template <>
......
......@@ -509,10 +509,10 @@ All parameter, weight, gradient are variables in Paddle.
self.num_threads_ = num_threads;
})
.def_property(
"use_event",
[](const ExecutionStrategy &self) { return self.use_event_; },
[](ExecutionStrategy &self, bool use_event) {
self.use_event_ = use_event;
"use_cuda",
[](const ExecutionStrategy &self) { return self.use_cuda_; },
[](ExecutionStrategy &self, bool use_cuda) {
self.use_cuda_ = use_cuda;
})
.def_property(
"allow_op_delay",
......
......@@ -181,6 +181,7 @@ function build() {
============================================
EOF
make clean
make -j `nproc`
make install -j `nproc`
}
......
......@@ -30,7 +30,7 @@ int main(int argc, char** argv) {
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
#else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
#endif
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();
......
......@@ -119,7 +119,8 @@ def reader_creator(data_file,
yield sample, int(label) - 1
if use_xmap:
return xmap_readers(mapper, reader, cpu_count(), buffered_size)
cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
return xmap_readers(mapper, reader, cpu_num, buffered_size)
else:
return map_readers(mapper, reader)
......
......@@ -26,6 +26,7 @@ from trainer import BeginEpochEvent
from trainer import EndEpochEvent
from trainer import BeginStepEvent
from trainer import EndStepEvent
from trainer import CheckpointConfig
import inferencer
from inferencer import Inferencer
......@@ -116,7 +117,7 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope'
'eager_delete_scope', 'use_mkldnn'
]
if core.is_compiled_with_cuda():
read_env_flags += [
......
......@@ -15,6 +15,7 @@
from __future__ import print_function
import core
import numpy
import os
import six.moves as six
import multiprocessing
......@@ -150,7 +151,9 @@ class DataFeeder(object):
elif isinstance(self.place, core.CUDAPlace):
return core.get_cuda_device_count()
else:
return multiprocessing.cpu_count()
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
return cpu_num
def decorate_reader(self,
reader,
......
......@@ -363,6 +363,13 @@ class OpProtoHolder(object):
raise ValueError("Operator \"%s\" has not been registered." % type)
return self.op_proto_map[type]
@staticmethod
def generated_op_attr_names():
return {
core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName()
}
class Operator(object):
"""
......
......@@ -56,6 +56,8 @@ class Inferencer(object):
else:
self.exe = executor.Executor(self.place)
self.inference_program = self.inference_program.clone(for_test=True)
def infer(self, inputs, return_numpy=True):
"""
:param inputs: a map of {"input_name": input_var} that will be feed into the inference program
......
......@@ -24,7 +24,8 @@ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
'load_persistables', 'save_inference_model', 'load_inference_model',
'get_inference_program', 'save_checkpoint', 'load_checkpoint',
'clean_checkpoint'
'clean_checkpoint', 'load_persist_vars_without_grad',
'save_persist_vars_without_grad', 'get_latest_checkpoint_serial'
]
......@@ -457,95 +458,161 @@ def get_parameter_value_by_name(name, executor, program=None):
SUCCESS_MARK_FILENAME = "_SUCCESS"
CHECKPOINT_PREFIX = "checkpoint"
MODEL_DIR = "__model__"
TRAINER_PREFIX = "trainer"
CHECKPOINT_SEPARATOR = "_"
def save_checkpoint(executor,
checkpoint_dir=None,
max_num_checkpoints=3,
save_interval_secs=600,
main_program=None):
checkpoint_dir,
trainer_id,
trainer_args=None,
main_program=None,
max_num_checkpoints=3):
"""
    Save Checkpoint will save persistable LoDTensor variables from main_program in the checkpoint directory.
    The directories are named by serial number from 0 to (n - 1). save_checkpoint uses an LRU strategy
    to keep the number of checkpoint directories at most max_num_checkpoints,
    and the interval between two saved checkpoints must be greater than save_interval_secs.
:param executor
:param checkpoint_dir
:param max_num_checkpoints
:param save_interval_secs
:param main_program
    :param executor executor used to save the variables
    :param checkpoint_dir the checkpoint directory
    :param trainer_id current trainer id; if the id equals 0, the trainer is the chief
    :param main_program will save all variables in this program
    :param max_num_checkpoints keep at most max_num_checkpoints checkpoint serials
"""
if checkpoint_dir is None:
checkpoint_dir = os.getcwd()
raise ValueError("'checkpoint_dir' should not be None")
if trainer_args:
assert isinstance(trainer_args, dict)
if not os.path.isdir(checkpoint_dir):
os.makedirs(checkpoint_dir)
serial = _get_lastest_checkpoint_dir(checkpoint_dir)
if serial >= 0 and not _interval_secs_exceed(
_get_serial_dir(serial, checkpoint_dir), save_interval_secs):
return
serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
cur_dir = _get_serial_dir(checkpoint_dir, serial)
serial += 1
cur_dir = _get_serial_dir(serial, checkpoint_dir)
save_trainer_args(cur_dir, trainer_id, trainer_args)
save_vars(
executor,
dirname=cur_dir,
main_program=main_program,
vars=None,
predicate=_is_checkpoint_var,
filename=None)
_write_success(cur_dir)
_lru_delete(checkpoint_dir, max_num_checkpoints)
if trainer_id == 0:
save_persist_vars_without_grad(executor, cur_dir, main_program)
_scroll_delete(checkpoint_dir, max_num_checkpoints)
def load_checkpoint(executor, checkpoint_dir=None, main_program=None):
def load_checkpoint(executor, checkpoint_dir, serial, main_program):
"""
    Load a checkpoint from a directory by the executor;
    it will find the most recently saved checkpoint file and load it automatically.
:param executor
:param checkpoint_dir
:param main_program
    :param executor executor used to load the variables
    :param checkpoint_dir the checkpoint directory
    :param serial the serial folder in the checkpoint directory to be loaded
    :param main_program will load all variables in this program
"""
if checkpoint_dir is None:
checkpoint_dir = os.getcwd()
raise ValueError("'checkpoint_dir' should not be None")
serial = _get_lastest_checkpoint_dir(checkpoint_dir)
if serial is None or serial < 0:
raise ValueError("'serial' should not be None or <0 ")
if serial < 0:
return
if main_program is None:
raise ValueError('main_program should not be None.')
cur_dir = _get_serial_dir(serial, checkpoint_dir)
load_vars(
executor,
dirname=cur_dir,
main_program=main_program,
predicate=_is_checkpoint_var,
filename=None)
cur_dir = _get_serial_dir(checkpoint_dir, serial)
load_persist_vars_without_grad(executor, cur_dir, main_program, True)
def clean_checkpoint(checkpoint_dir, delete_dir=False):
"""
    clean the checkpoint dir; when training exits normally, the trainer will call clean_checkpoint to delete the checkpoint directories saved before.
    delete_dir only works when the directory is empty; otherwise, an OSError is raised.
:param checkpoint_dir
:param delete_dir
"""
if checkpoint_dir is None:
checkpoint_dir = os.getcwd()
_lru_delete(checkpoint_dir, max_num_checkpoints=0)
raise ValueError("'checkpoint_dir' should not be None")
_scroll_delete(checkpoint_dir, max_num_checkpoints=0)
if delete_dir and not os.listdir(checkpoint_dir):
os.rmdir(checkpoint_dir)
def _get_serial_dir(serial, checkpoint_dir):
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
return os.path.join(checkpoint_dir, serial_folder)
def load_persist_vars_without_grad(executor,
dirname,
program,
has_model_dir=False):
"""
    load_persist_vars_without_grad will load variables from a directory by an executor;
    variables whose names end with "@GRAD" will not be loaded.
    :param executor executor used to load the variables
    :param dirname the checkpoint directory
    :param program will load all variables in this program
    :param has_model_dir if has_model_dir is True, variables will be loaded from the sub-directory named __model__
"""
if has_model_dir:
dirname = _get_model_dir(dirname)
load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var,
filename=None)
def save_persist_vars_without_grad(executor, dirname, program):
"""
    save_persist_vars_without_grad will save variables to a directory by an executor;
    variables whose names end with "@GRAD" will not be saved.
    :param executor executor used to save the variables
    :param dirname the checkpoint directory
    :param program will save all variables in this program
"""
cur_dir = _get_model_dir(dirname)
save_vars(
executor,
dirname=cur_dir,
main_program=program,
vars=None,
predicate=_is_checkpoint_var,
filename=None)
_write_success(cur_dir)
def save_trainer_args(dirname, trainer_id, trainer_args):
assert isinstance(trainer_args, dict)
cur_dir = _get_trainer_dir(dirname, trainer_id)
for name, value in trainer_args.iteritems():
args_file = os.path.join(cur_dir, name)
with open(args_file, 'w') as f:
f.write(str(value))
_write_success(cur_dir)
def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
assert isinstance(trainer_args, list)
cur_dir = _get_serial_dir(checkpoint_dir, serial)
cur_dir = _get_trainer_dir(cur_dir, trainer_id)
ret_values = []
for arg in trainer_args:
cur_file = os.path.join(cur_dir, arg)
with open(cur_file, 'r') as f:
contents = f.read()
ret_values.append(contents.strip())
return ret_values
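Putting the pieces above together, a hedged usage sketch of this checkpoint API (the directory name, program, and trainer arguments below are placeholders, not values from this change):

```python
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
program = fluid.default_main_program()

# The chief trainer (trainer_id == 0) also persists the model variables.
fluid.io.save_checkpoint(
    executor=exe,
    checkpoint_dir="/tmp/ckpt",
    trainer_id=0,
    trainer_args={"epoch_id": 1, "step_id": 100},
    main_program=program,
    max_num_checkpoints=3)

# Later: find the most recent serial and restore the variables from it.
serial = fluid.io.get_latest_checkpoint_serial("/tmp/ckpt")
if serial >= 0:
    fluid.io.load_checkpoint(exe, "/tmp/ckpt", serial, program)
```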
def _is_checkpoint_var(var):
......@@ -559,36 +626,74 @@ def _is_checkpoint_var(var):
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
    # Variables whose names contain "@GRAD" are gradient variables; checkpoint will not save them.
if "@GRAD" in var.name:
return False
    # Names containing ".trainer_" belong to distributed-training variables; checkpoint will not save them.
if ".trainer_" in var.name:
return False
if var.name.endswith("@GRAD"):
    # Names containing ".block" belong to distributed-training variables; checkpoint will not save them.
if ".block" in var.name:
return False
return var.persistable
def _interval_secs_exceed(dirname, save_interval_secs):
dir_time = os.path.getmtime(dirname)
if save_interval_secs > (time.time() - dir_time):
return False
return True
def _get_dir_serial(dirname):
_, serial = dirname.split(CHECKPOINT_SEPARATOR)
try:
serial_num = int(serial)
except ValueError:
serial_num = -1
return serial_num
def _get_serial_dir(dirname, serial):
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
serial_dir = os.path.join(dirname, serial_folder)
if not os.path.isdir(serial_dir):
os.makedirs(serial_dir)
return serial_dir
def _get_model_dir(dirname):
model_dir = os.path.join(dirname, MODEL_DIR)
def _lru_delete(dirname, max_num_checkpoints=3):
if not os.path.isdir(model_dir):
os.makedirs(model_dir)
return model_dir
def _get_trainer_dir(dirname, trainer_id):
trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
trainer_dir = os.path.join(dirname, trainer_folder)
if not os.path.isdir(trainer_dir):
os.makedirs(trainer_dir)
return trainer_dir
def _scroll_delete(dirname, max_num_checkpoints=3):
dirs = os.listdir(dirname)
serials = []
serial_map = {}
for serial in dirs:
try:
serials.append(int(serial))
except ValueError:
continue
serial_num = _get_dir_serial(serial)
serial_map[serial_num] = serial
if len(serials) <= max_num_checkpoints:
if len(serial_map.keys()) <= max_num_checkpoints:
return
serials = serial_map.keys()
serials.sort(reverse=True)
serials = serials[max_num_checkpoints:]
for serial in serials:
cur_dir = os.path.join(dirname, str(serial))
cur_dir = _get_serial_dir(dirname, serial)
shutil.rmtree(cur_dir)
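The pruning policy of _scroll_delete above can be illustrated in isolation (pure Python, with made-up serial numbers): the newest max_num_checkpoints serials are kept and the rest are removed.

```python
# Illustrative stand-in for the serial numbers found in the checkpoint dir.
serials = [0, 1, 2, 3, 4]
max_num_checkpoints = 3

serials.sort(reverse=True)
to_delete = serials[max_num_checkpoints:]
print(to_delete)  # [1, 0] -- the oldest checkpoint serials are removed
```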
......@@ -604,33 +709,30 @@ def _write_success(dirname):
f.write(now)
def _get_lastest_checkpoint_dir(checkpoint_dir):
def get_latest_checkpoint_serial(checkpoint_dir):
"""
    get the latest checkpoint serial in the checkpoint directory; the _SUCCESS file must exist in that serial's directory
:param checkpoint_dir
"""
if not checkpoint_dir.strip():
if not checkpoint_dir:
return -1
def has_success(checkpoint_dir, cur_dir):
"""
is _SUCCESS in this dir
"""
_, serial = cur_dir.split(CHECKPOINT_SEPARATOR)
try:
int(serial)
except ValueError:
return -1
if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
serial = _get_dir_serial(cur_dir)
if serial == -1 or not os.path.isdir(
os.path.join(checkpoint_dir, cur_dir)):
return -1
success_path = os.path.join(
_get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME)
_get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
SUCCESS_MARK_FILENAME)
if os.path.isfile(success_path):
return int(serial)
return serial
if not os.path.isdir(checkpoint_dir):
return -1
......
......@@ -13,7 +13,7 @@
# limitations under the License.
import contextlib
from layer_function_generator import autodoc
from layer_function_generator import autodoc, templatedoc
from tensor import assign, fill_constant
from .. import core
from ..framework import Program, Variable, Operator
......@@ -721,26 +721,22 @@ def lod_rank_table(x, level=0):
return table
@templatedoc()
def max_sequence_len(rank_table):
"""Max Sequence Len Operator. Given a LoDRankTable object, this layer
returns the max length of a batch of sequences. In fact, a LoDRankTable
object contains a list of tuples(<sequence index, sequence length>) and
the list is already sorted by sequence length in descending order, so the
operator just returns the sequence length of the first tuple element.
"""
${comment}
>>> import paddle.fluid as fluid
>>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
>>> lod_level=1)
>>> rank_table = layers.lod_rank_table(x=x, level=0)
>>> max_seq_len = layers.max_sequence_len(rank_table)
Args:
rank_table (Variable): Input variable which is a LoDRankTable object.
rank_table(${rank_table_type}): ${rank_table_comment}.
Returns:
Variable: The max length of sequence.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10],
dtype='float32', lod_level=1)
rank_table = layers.lod_rank_table(x=x, level=0)
max_seq_len = layers.max_sequence_len(rank_table)
${out_comment}.
"""
helper = LayerHelper("max_seqence_len", **locals())
res = helper.create_tmp_variable(dtype="int64")
......
......@@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name
from control_flow import BlockGuard
from ..layer_helper import LayerHelper
from ..executor import global_scope
from layer_function_generator import generate_layer_fn, templatedoc
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'random_data_generator', 'Preprocessor'
'random_data_generator', 'Preprocessor', 'load'
]
......@@ -662,3 +663,29 @@ class Preprocessor(object):
"sink_var_names": self.sink_var_names
})
return monkey_patch_reader_methods(self.reader)
@templatedoc()
def load(out, file_path, load_as_fp16=None):
"""
${comment}
>>> import paddle.fluid as fluid
>>> tmp_tensor = fluid.layers.create_tensor(dtype='float32')
>>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin")
Args:
out(${out_type}): ${out_comment}.
file_path(${file_path_type}): ${file_path_comment}.
load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}.
Returns:
None
"""
helper = LayerHelper("load", **locals())
attrs = {"file_path": file_path}
if load_as_fp16 is not None:
attrs['load_as_fp16'] = load_as_fp16
helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs)
......@@ -15,16 +15,13 @@ import re
import cStringIO
import functools
import warnings
import string
from ..proto import framework_pb2
from ..framework import OpProtoHolder, Variable
from ..layer_helper import LayerHelper
__all__ = [
'deprecated',
'generate_layer_fn',
'autodoc',
]
__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc']
def _convert_(name):
......@@ -43,6 +40,10 @@ def _convert_(name):
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
def _type_to_str_(tp):
return framework_pb2.AttrType.Name(tp)
def _generate_doc_string_(op_proto):
"""
Generate docstring by OpProto
......@@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto):
str: the document string
"""
def _type_to_str_(tp):
return framework_pb2.AttrType.Name(tp)
if not isinstance(op_proto, framework_pb2.OpProto):
raise TypeError("OpProto should be `framework_pb2.OpProto`")
......@@ -75,7 +73,11 @@ def _generate_doc_string_(op_proto):
buf.write(str(each_input.dispensable))
buf.write('\n')
skip_attrs = OpProtoHolder.generated_op_attr_names()
for each_attr in op_proto.attrs:
if each_attr.name in skip_attrs:
continue
buf.write(' ')
buf.write(each_attr.name)
buf.write(' (')
......@@ -220,3 +222,67 @@ def autodoc(comment=""):
return func
return __impl__
_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$")
def templatedoc(op_type=None):
"""
Decorator of layer function. It will use the docstring from the layer
function as the template. The template arguments are:
    * ${comment}: The operator comment written in C++.
    * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
      and AddInput. The ${name} is in Python snake case, i.e., xxx_xxx.
    * ${{name}_type}: The type of ${name}.
Returns:
Decorated function.
"""
def trim_ending_dot(msg):
return msg.rstrip('.')
def escape_inline_math(msg):
return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg)
def __impl__(func):
if op_type is None:
op_type_name = func.__name__
else:
op_type_name = op_type
op_proto = OpProtoHolder.instance().get_op_proto(op_type_name)
tmpl = string.Template(func.__doc__)
comment_lines = op_proto.comment.split("\n")
comment = ""
for line in comment_lines:
line = line.strip()
if len(line) != 0:
comment += escape_inline_math(line)
comment += " "
elif len(comment) != 0:
comment += "\n \n "
args = {"comment": trim_ending_dot(comment)}
for each_input in op_proto.inputs:
input_name = _convert_(each_input.name)
args["{0}_comment".format(input_name)] = trim_ending_dot(
each_input.comment)
args["{0}_type".format(input_name)] = "Variable"
for each_attr in op_proto.attrs:
input_name = _convert_(each_attr.name)
args["{0}_comment".format(input_name)] = trim_ending_dot(
each_attr.comment)
args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
for each_opt in op_proto.outputs:
output_name = _convert_(each_opt.name)
args["{0}_comment".format(output_name)] = trim_ending_dot(
each_opt.comment)
args["{0}_type".format(output_name)] = "Variable"
func.__doc__ = tmpl.substitute(args)
return func
return __impl__
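The core of templatedoc is plain string.Template substitution over the layer's docstring, with keys derived from the OpProto. A small self-contained sketch of that mechanism, using a hand-built args dict instead of a real OpProto (the values are illustrative stand-ins):

```python
import string

# Docstring written by the layer author, with ${...} placeholders.
doc = """
${comment}

Args:
    rank_table(${rank_table_type}): ${rank_table_comment}.

Returns:
    ${out_comment}.
"""

# In templatedoc these values come from op_proto.comment / inputs / outputs;
# here they are hand-written stand-ins.
args = {
    "comment": "Given a LoDRankTable object, returns the max sequence length",
    "rank_table_type": "Variable",
    "rank_table_comment": "The input LoDRankTable object",
    "out_comment": "The max length of sequence",
}

print(string.Template(doc).substitute(args))
```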
......@@ -11,6 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
When training a model, it is often useful to decay the
learning rate during the training process; this is called
learning_rate_decay. There are many strategies to do
this, and this module provides some classical methods.
Users can also implement their own learning_rate_decay
strategy based on this module.
"""
import control_flow
import nn
......@@ -22,14 +30,6 @@ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay', 'noam_decay'
]
"""
When training a model, it is often useful to decay the
learning rate during the training process; this is called
learning_rate_decay. There are many strategies to do
this, and this module provides some classical methods.
Users can also implement their own learning_rate_decay
strategy based on this module.
"""
def _decay_step_counter(begin=0):
......@@ -41,18 +41,20 @@ def _decay_step_counter(begin=0):
def noam_decay(d_model, warmup_steps):
"""Apply decay to learning rate.
```python
lr_value = np.power(d_model, -0.5) * np.min([
np.power(current_steps, -0.5),
np.power(warmup_steps, -1.5) * current_steps
])
```
"""
    Noam decay method. The numpy implementation of noam decay is as follows.
>>> import numpy as np
>>> lr_value = np.power(d_model, -0.5) * np.min([
>>> np.power(current_steps, -0.5),
>>> np.power(warmup_steps, -1.5) * current_steps])
    Please refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.
Args:
d_model(Variable): The dimensionality of input and output of model.
Reference: attention is all you need
https://arxiv.org/pdf/1706.03762.pdf
        warmup_steps(Variable): A hyperparameter.
Returns:
......
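A small numeric check of the noam decay formula quoted in the docstring above (pure numpy, with d_model=512 and warmup_steps=4000 chosen only as illustrative values):

```python
import numpy as np

def noam_lr(step, d_model=512, warmup_steps=4000):
    # lr = d_model^-0.5 * min(step^-0.5, warmup_steps^-1.5 * step)
    return np.power(d_model, -0.5) * min(
        np.power(float(step), -0.5), np.power(warmup_steps, -1.5) * step)

print(noam_lr(1))       # very small lr at the start of warmup
print(noam_lr(4000))    # peak, reached around warmup_steps
print(noam_lr(40000))   # then decays proportionally to 1/sqrt(step)
```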
......@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200):
topk_indices = helper.create_tmp_variable(dtype="int64")
topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
......
......@@ -19,9 +19,10 @@ from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
from layer_function_generator import autodoc
from layer_function_generator import autodoc, templatedoc
from tensor import concat
import utils
import random
__all__ = [
'fc',
......@@ -801,7 +802,22 @@ def gru_unit(input,
return updated_hidden, reset_hidden_pre, gate
@templatedoc()
def linear_chain_crf(input, label, param_attr=None):
"""
Linear Chain CRF.
${comment}
Args:
input(${emission_type}): ${emission_comment}
label(${label_type}): ${label_comment}
param_attr(ParamAttr): The attribute of the learnable parameter.
Returns:
${log_likelihood_comment}
"""
helper = LayerHelper('linear_chain_crf', **locals())
size = input.shape[1]
transition = helper.create_parameter(
......@@ -827,7 +843,19 @@ def linear_chain_crf(input, label, param_attr=None):
return log_likelihood
@templatedoc()
def crf_decoding(input, param_attr, label=None):
"""
${comment}
Args:
input(${emission_type}): ${emission_comment}
param_attr(ParamAttr): The parameter attribute for training.
label(${label_type}): ${label_comment}
Returns:
${viterbi_path_comment}
"""
helper = LayerHelper('crf_decoding', **locals())
transition = helper.get_parameter(param_attr.name)
viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
......@@ -2439,19 +2467,21 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
The l2 normalize layer normalizes `x` along dimension `axis` using an L2
norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
output = x / sqrt(max(sum(x**2), epsilon))
.. math::
        y = \frac{x}{\sqrt{\sum{x^2} + epsilon}}
For `x` with more dimensions, this layer independently normalizes each 1-D
slice along dimension `axis`.
Args:
x(Variable|list): The input tensor to l2_normalize layer.
axis(int): Dimension along which to normalize the input.
epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
be used as the divisor if the l2 norm of `x` is less than
sqrt(epsilon).
        axis(int): The axis along which to apply normalization. If `axis < 0`,
            the axis to normalize is rank(X) + axis; -1 means the
            last dimension.
        epsilon(float): The epsilon value is used to avoid division by zero;
            the default value is 1e-10.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
will be named automatically.
Returns:
......@@ -2470,46 +2500,17 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
axis = 0
helper = LayerHelper("l2_normalize", **locals())
square = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
out = helper.create_tmp_variable(dtype=x.dtype)
norm = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="reduce_sum",
inputs={"X": square},
outputs={"Out": reduced_sum},
type="norm",
inputs={"X": x},
outputs={"Out": out,
"Norm": norm},
attrs={
"dim": [1] if axis is None else [axis],
"keep_dim": True,
"reduce_all": False
"axis": 1 if axis is None else axis,
"epsilon": epsilon,
})
# TODO(caoying) A lower bound value epsilon for the norm is needed to
    # improve the numeric stability of reciprocal. This requires a maximum_op.
rsquare = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
# TODO(caoying) the current elementwise_mul operator does not support a
# general broadcast rule which broadcasts input(Y) to have the same
# dimension with Input(X) starting from a specified dimension. So this
    # expansion is required. Once a general broadcast rule is supported, this
    # expansion can be removed.
rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
expand_times = [1] * len(x.shape)
expand_times[axis] = int(x.shape[axis])
helper.append_op(
type="expand",
inputs={"X": rsquare},
outputs={"Out": rsquare_expanded},
attrs={"expand_times": expand_times})
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="elementwise_mul",
inputs={"X": x,
"Y": rsquare_expanded},
outputs={"Out": out})
return out
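The math above is easy to check against a few lines of numpy; this hedged sketch only mirrors the documented semantics (normalize each 1-D slice along `axis`), not the norm op itself:

```python
import numpy as np

def l2_normalize_np(x, axis, epsilon=1e-12):
    # y = x / sqrt(sum(x^2) + epsilon), computed per slice along `axis`
    norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True) + epsilon)
    return x / norm

x = np.random.rand(4, 8).astype('float32')
y = l2_normalize_np(x, axis=1)
print(np.sum(y * y, axis=1))  # each row is now (approximately) unit length
```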
......@@ -4009,18 +4010,25 @@ def image_resize(input,
return out
@templatedoc(op_type="bilinear_interp")
def resize_bilinear(input, out_shape=None, scale=None, name=None):
"""
This is an alias of layer 'image_resize' with bilinear interpolation.
${comment}
Args:
input(${x_type}): ${x_comment}.
out_shape(${out_size_type}): ${out_size_comment}.
scale(float|None): The multiplier for the input height or width. At
            least one of out_shape or scale must be set, and out_shape has
            a higher priority than scale. Default: None.
name(str|None): The output variable name.
The mathematical meaning of resize bilinear layer is
Bilinear interpolation.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this layer) on a rectilinear 2D grid.
Returns:
For details, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
${out_comment}.
"""
return image_resize(input, out_shape, scale, name, 'BILINEAR')
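A hedged usage sketch for resize_bilinear as documented above (the input shape and target sizes below are placeholders):

```python
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')

# Either give an explicit output shape ...
up1 = fluid.layers.resize_bilinear(img, out_shape=[64, 64])

# ... or a scale factor; out_shape has higher priority when both are set.
up2 = fluid.layers.resize_bilinear(img, scale=2.0)
```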
......@@ -4107,10 +4115,31 @@ def gather(input, index):
return out
def random_crop(input, shape, seed=1):
@templatedoc()
def random_crop(x, shape, seed=None):
"""
${comment}
Examples:
>>> img = fluid.layers.data("img", [3, 256, 256])
>>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
Args:
x(${x_type}): ${x_comment}
shape(${shape_type}): ${shape_comment}
        seed(int|${seed_type}|None): ${seed_comment} By default, the seed will
            be obtained from `random.randint(-65536, 65535)`.
Returns:
${out_comment}
"""
helper = LayerHelper("random_crop", **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
if seed is None:
seed = random.randint(-65536, 65535)
if isinstance(seed, int):
seed_value = seed
seed = helper.create_tmp_variable(dtype="int64")
......
......@@ -71,8 +71,10 @@ __all__ = [
'cumsum',
'scatter',
'sum',
'slice',
'polygon_box_transform',
'shape',
'maxout',
] + __activations__
for _OP in set(__all__):
......
......@@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable
from ..initializer import Constant, force_init_on_cpu
from ..core import VarDesc
from layer_function_generator import templatedoc
import numpy
__all__ = [
......@@ -30,6 +31,8 @@ __all__ = [
'assign',
'fill_constant_batch_size_like',
'fill_constant',
'argmin',
'argmax',
'ones',
'zeros',
]
......@@ -266,6 +269,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
return out
@templatedoc()
def fill_constant_batch_size_like(input,
shape,
dtype,
......@@ -273,30 +277,28 @@ def fill_constant_batch_size_like(input,
input_dim_idx=0,
output_dim_idx=0):
"""
**fill_constant_batch_size_like**
This function creates a tensor of specified *shape*, *dtype* and batch size,
and initializes this with a constant supplied in *value*. The batch size is
obtained from the `input` tensor.
${comment}
It also sets *stop_gradient* to True.
>>> data = fluid.layers.fill_constant_batch_size_like(
>>> input=like, shape=[1], value=0, dtype='int64')
Args:
input(Variable): Tensor whose dimensions will be used to get batch size
shape(tuple|list|None): Shape of output tensor
dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
value(float): Constant value to initialize the output tensor
input_dim_idx(int): Index of input's batch size dimension
output_dim_idx(int): Index of output's batch size dimension
input(${input_type}): ${input_comment}.
Returns:
Variable: The tensor variable storing the output
shape(${shape_type}): ${shape_comment}.
Examples:
.. code-block:: python
dtype(${dtype_type}): ${dtype_comment}.
value(${value_type}): ${value_comment}.
input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}.
output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}.
data = fluid.layers.fill_constant_batch_size_like(
input=like, shape=[1], value=0, dtype='int64')
Returns:
${out_comment}.
"""
helper = LayerHelper("fill_constant_batch_size_like", **locals())
out = helper.create_tmp_variable(dtype=dtype)
......@@ -315,6 +317,68 @@ def fill_constant_batch_size_like(input,
return out
def argmin(x, axis=0):
"""
**argmin**
    This function computes the indices of the minimum elements
    of the input tensor along the provided axis.
Args:
x(Variable): The input to compute the indices of
the min elements.
axis(int): Axis to compute indices along.
Returns:
Variable: The tensor variable storing the output
Examples:
.. code-block:: python
out = fluid.layers.argmin(x=in, axis=0)
out = fluid.layers.argmin(x=in, axis=-1)
"""
helper = LayerHelper("arg_min", **locals())
out = helper.create_tmp_variable(VarDesc.VarType.INT64)
helper.append_op(
type='arg_min',
inputs={'X': x},
outputs={'Out': [out]},
attrs={'axis': axis})
return out
def argmax(x, axis=0):
"""
**argmax**
    This function computes the indices of the maximum elements
    of the input tensor along the provided axis.
Args:
x(Variable): The input to compute the indices of
the max elements.
axis(int): Axis to compute indices along.
Returns:
Variable: The tensor variable storing the output
Examples:
.. code-block:: python
out = fluid.layers.argmax(x=in, axis=0)
out = fluid.layers.argmax(x=in, axis=-1)
"""
helper = LayerHelper("arg_max", **locals())
out = helper.create_tmp_variable(VarDesc.VarType.INT64)
helper.append_op(
type='arg_max',
inputs={'X': x},
outputs={'Out': [out]},
attrs={'axis': axis})
return out
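Both layers are expected to follow numpy's argmin/argmax semantics along the given axis; a tiny numpy sketch of the behaviour described in the docstrings above:

```python
import numpy as np

x = np.array([[3, 1, 2],
              [0, 5, 4]])

print(np.argmin(x, axis=0))   # [1 0 0] -- row index of the min in each column
print(np.argmax(x, axis=-1))  # [0 1]   -- column index of the max in each row
```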
def ones(shape, dtype, force_cpu=False):
"""
**ones**
......@@ -437,22 +501,6 @@ def save_combine(x, file_path, overwrite=True):
"overwrite": overwrite})
def load(out, file_path):
"""
Loads a variable from a given file.
Args:
out(variable): The variable to be read from the disk file.
file_path(str): The path of the disk file.
"""
helper = LayerHelper("load", **locals())
helper.append_op(
type="load",
inputs={},
output={"Out": out},
args={"file_path": file_path})
def load_combine(out, file_path):
"""
    Loads a list of variables from a single file.
......
......@@ -18,6 +18,7 @@ import framework
import executor
import warnings
import sys
import os
__all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
......@@ -101,7 +102,9 @@ class ParallelExecutor(object):
p.set_place(self._act_places[-1])
self._places.append(p)
else:
for i in xrange(multiprocessing.cpu_count()):
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
for i in xrange(cpu_num):
p = core.Place()
self._act_places.append(core.CPUPlace())
p.set_place(self._act_places[-1])
......@@ -110,19 +113,17 @@ class ParallelExecutor(object):
if exec_strategy is None:
exec_strategy = ExecutionStrategy()
if use_cuda:
exec_strategy.use_event = True
else:
exec_strategy.use_event = False
exec_strategy.use_cuda = use_cuda
if exec_strategy.num_threads == 0:
if use_cuda:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
exec_strategy.num_threads = len(self._places) * 2
exec_strategy.num_threads = len(self._places) * 4
else:
exec_strategy.num_threads = min(
len(self._places) * 2, multiprocessing.cpu_count())
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num
if build_strategy is None:
build_strategy = BuildStrategy()
......
......@@ -38,7 +38,7 @@ def inference_program():
return y_predict
def linear():
def train_program():
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = inference_program()
......@@ -104,7 +104,7 @@ def main(use_cuda):
# Directory for saving the trained model
params_dirname = "fit_a_line.inference.model"
train(use_cuda, linear, params_dirname)
train(use_cuda, train_program, params_dirname)
infer(use_cuda, inference_program, params_dirname)
......
......@@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME)
endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
#list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
#list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
# TODO(wuyi): this test hangs on CI, will add it back later
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
foreach(TEST_OP ${TEST_OPS})
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing
import os
import unittest
import paddle.fluid as fluid
import time
......@@ -23,6 +25,7 @@ __all__ = ['TestParallelExecutorBase']
class TestParallelExecutorBase(unittest.TestCase):
def check_network_convergence(self,
method,
use_cuda=True,
memory_opt=True,
iter=50,
batch_size=None,
......@@ -53,7 +56,7 @@ class TestParallelExecutorBase(unittest.TestCase):
adam.minimize(loss)
if memory_opt:
fluid.memory_optimize(main)
place = fluid.CUDAPlace(0)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
exec_strategy = fluid.ExecutionStrategy()
......@@ -64,7 +67,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if use_parallel_executor:
exe = fluid.ParallelExecutor(
True,
use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
build_strategy=build_strategy)
......@@ -72,7 +75,9 @@ class TestParallelExecutorBase(unittest.TestCase):
exe = fluid.Executor(place=place)
if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count()
batch_size *= fluid.core.get_cuda_device_count(
) if use_cuda else int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
begin = time.time()
first_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class BaseTestCase(OpTest):
def initTestCase(self):
self.op_type = 'arg_min'
self.dims = (3, 4, 5)
self.dtype = 'float32'
self.axis = 0
def setUp(self):
self.initTestCase()
self.x = (1000 * np.random.random(self.dims)).astype(self.dtype)
self.inputs = {'X': self.x}
self.attrs = {'axis': self.axis}
if self.op_type == "arg_min":
self.outputs = {'Out': np.argmin(self.x, axis=self.axis)}
else:
self.outputs = {'Out': np.argmax(self.x, axis=self.axis)}
def test_check_output(self):
self.check_output()
class TestCase0(BaseTestCase):
def initTestCase(self):
self.op_type = 'arg_max'
self.dims = (3, 4, 5)
self.dtype = 'float32'
self.axis = 0
class TestCase1(BaseTestCase):
def initTestCase(self):
self.op_type = 'arg_min'
self.dims = (3, 4)
self.dtype = 'float64'
self.axis = 1
class TestCase2(BaseTestCase):
def initTestCase(self):
self.op_type = 'arg_max'
self.dims = (3, 4)
self.dtype = 'int64'
self.axis = 0
class TestCase3(BaseTestCase):
def initTestCase(self):
self.op_type = 'arg_max'
self.dims = (3, )
self.dtype = 'int64'
self.axis = 0
class TestCase4(BaseTestCase):
def initTestCase(self):
self.op_type = 'arg_min'
self.dims = (1, )
self.dtype = 'int32'
self.axis = 0
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
import os
import tempfile
class TestCheckpoint(unittest.TestCase):
def setUp(self):
self.dirname = tempfile.mktemp()
self.max_num_checkpoints = 3
self.epoch_interval = 1
self.step_interval = 1
self.trainer_id = 0
self.chief = self.trainer_id == 0
self.place = fluid.CPUPlace()
self.epoch_id = 100
self.step_id = 20
def test_checkpoint(self):
self.save_checkpoint()
serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
self.assertTrue(serial >= 0)
trainer_args = ["epoch_id", "step_id"]
epoch_id, step_id = fluid.io.load_trainer_args(
self.dirname, serial, self.trainer_id, trainer_args)
self.assertEqual(self.step_id, int(step_id))
self.assertEqual(self.epoch_id, int(epoch_id))
program = fluid.Program()
with fluid.program_guard(program):
exe = fluid.Executor(self.place)
fluid.io.load_checkpoint(exe, self.dirname, serial, program)
fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
self.assertFalse(os.path.isdir(self.dirname))
def save_checkpoint(self):
config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
self.epoch_interval, self.step_interval)
trainer_args = {}
trainer_args["epoch_id"] = self.epoch_id
trainer_args["step_id"] = self.step_id
program = fluid.Program()
with fluid.program_guard(program):
program.global_block().create_var(
name="scale_0",
                persistable=True,
dtype="float32",
shape=[32, 32])
exe = fluid.Executor(self.place)
for i in xrange(10):
fluid.io.save_checkpoint(exe, config.checkpoint_dir,
self.trainer_id, trainer_args, program,
config.max_num_checkpoints)
if __name__ == '__main__':
unittest.main()
......@@ -42,9 +42,9 @@ class TestCropOp(OpTest):
def setUp(self):
self.op_type = "crop"
self.crop_by_input = False
self.offset_by_input = False
self.attrs = {}
self.initTestCase()
self.attrs['offsets'] = self.offsets
if self.crop_by_input:
self.inputs = {
'X': np.random.random(self.x_shape).astype("float32"),
......@@ -55,6 +55,10 @@ class TestCropOp(OpTest):
self.inputs = {
'X': np.random.random(self.x_shape).astype("float32"),
}
if self.offset_by_input:
self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
else:
self.attrs['offsets'] = self.offsets
self.outputs = {
'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
}
......@@ -101,5 +105,22 @@ class TestCase4(TestCropOp):
self.crop_by_input = True
class TestCase5(TestCropOp):
def initTestCase(self):
self.x_shape = (3, 4, 5)
self.crop_shape = [2, 2, 3]
self.offsets = [1, 0, 2]
self.offset_by_input = True
class TestCase6(TestCropOp):
def initTestCase(self):
self.x_shape = (10, 9, 14)
self.crop_shape = [3, 3, 5]
self.offsets = [3, 5, 4]
self.crop_by_input = True
self.offset_by_input = True
if __name__ == '__main__':
unittest.main()
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
......@@ -54,10 +55,10 @@ class TestDistTranspiler(TranspilerTest):
delete_ops(trainer.global_block(), optimize_ops)
ops = [op.type for op in trainer.global_block().ops] + [
"split_byref", "send_vars", "send_barrier", "recv", "recv",
"split_byref", "send", "send_barrier", "recv", "recv",
"fetch_barrier", "concat"
]
ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
ops.insert(ops.index("elementwise_add_grad") + 1, "send")
return ops
......
......@@ -30,9 +30,6 @@ class Memory(object):
assert val.dtype == self.ex.dtype
self.cur = val
def ex(self):
return self.ex
def next(self):
self.ex = self.cur
self.cur = None
......
......@@ -387,6 +387,20 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_l2_normalize(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[8, 7, 10], dtype="float32")
output = layers.l2_normalize(x, axis=1)
def test_maxout(self):
program = Program()
with program_guard(program):
data = layers.data(name='x', shape=[8, 6, 6], dtype="float32")
output = layers.maxout(x=data, groups=2)
self.assertIsNotNone(output)
print(str(program))
if __name__ == '__main__':
unittest.main()
......@@ -70,17 +70,18 @@ class TestListenAndServOp(OpTest):
return p.pid
def _wait_ps_ready(self, pid):
retry_times = self.ps_timeout
start_left_time = self.ps_timeout
sleep_time = 0.5
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(0.5)
assert start_left_time >= 0, "wait ps ready failed"
time.sleep(sleep_time)
try:
                # the listen_and_serv op touches a file containing the listen port
                # in the /tmp directory once it is ready to process all the RPC calls.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
start_left_time -= sleep_time
def test_rpc_interfaces(self):
        # TODO(Yancey1989): need to make sure the rpc interfaces work correctly.
......
......@@ -17,44 +17,23 @@ import numpy as np
from op_test import OpTest
def norm(input, scale, epsilon):
s0, s1, s2, s3 = input.shape
x_square = input * input
for i in xrange(s0):
input_batch = input[i:i + 1, :, :, :]
input_batch = input_batch.reshape(s1, s2 * s3)
x_square_batch = x_square[i:i + 1, :, :, :]
x_square_batch = x_square_batch.reshape(s1, s2 * s3)
square_colsum = x_square_batch.sum(axis=0) + epsilon
tmp = pow(square_colsum, 0.5)
tmp = np.reciprocal(tmp)
tmp_tile = np.tile(tmp, s1)
tmp_tile = tmp_tile.reshape(s1, s2 * s3)
scale_tile = np.tile(scale, (1, s2 * s3))
scale_tile = scale_tile.reshape(s1, s2 * s3)
out_batch = input_batch * tmp_tile * scale_tile
out_batch = out_batch.reshape(1, s1, s2, s3)
if i == 0:
out = out_batch
else:
out = np.concatenate((out, out_batch), 0)
out.reshape(s0, s1, s2, s3)
return out
def l2_norm(x, axis, epsilon):
x2 = x**2
s = np.sum(x2, axis=axis, keepdims=True)
r = np.sqrt(s + epsilon)
y = x / np.broadcast_to(r, x.shape)
return y, r
class TestNormOp(OpTest):
def setUp(self):
self.op_type = "norm"
self.init_test_case()
input = np.random.random(self.shape).astype("float32")
scale = np.array([10, 10, 10])
self.inputs = {
'X': input.astype('float32'),
'Scale': scale.astype('float32')
}
self.attrs = {'epsilon': self.epsilon}
output = norm(input, scale, self.epsilon)
self.outputs = {'Out': output.astype('float32')}
x = np.random.random(self.shape).astype("float64")
y, norm = l2_norm(x, self.axis, self.epsilon)
self.inputs = {'X': x}
self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
self.outputs = {'Out': y, 'Norm': norm}
def test_check_output(self):
self.check_output()
......@@ -63,8 +42,23 @@ class TestNormOp(OpTest):
self.check_grad(['X'], 'Out')
def init_test_case(self):
self.shape = [2, 3, 2, 2]
self.epsilon = 1e-6
self.shape = [2, 3, 4, 4]
self.axis = 1
self.epsilon = 1e-8
class TestNormOp2(TestNormOp):
def init_test_case(self):
self.shape = [5, 3, 9, 7]
self.axis = 0
self.epsilon = 1e-8
class TestNormOp3(TestNormOp):
def init_test_case(self):
self.shape = [5, 3, 2, 7]
self.axis = -1
self.epsilon = 1e-8
if __name__ == '__main__':
......
......@@ -70,8 +70,9 @@ class TestNormalization(unittest.TestCase):
def l2_normalize(self, data, axis, epsilon):
""" Compute the groundtruth.
"""
output = data * np.reciprocal(
np.sum(np.square(data), axis=axis, keepdims=True))
output = data / np.broadcast_to(
np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)),
data.shape)
return output
def test_l2_normalize(self):
......
......@@ -17,6 +17,7 @@ import paddle.fluid as fluid
import unittest
import paddle
import numpy as np
import os
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
......@@ -101,7 +102,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
class TestCRFModel(unittest.TestCase):
def check_network_convergence(self, is_sparse, build_strategy=None):
def check_network_convergence(self,
is_sparse,
build_strategy=None,
use_cuda=True):
os.environ['CPU_NUM'] = str(4)
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
......@@ -145,12 +150,12 @@ class TestCRFModel(unittest.TestCase):
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=16)
place = fluid.CUDAPlace(0)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(
use_cuda=True,
use_cuda=use_cuda,
loss_name=avg_cost.name,
build_strategy=build_strategy)
......@@ -172,25 +177,33 @@ class TestCRFModel(unittest.TestCase):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy)
is_sparse=True, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=False)
def test_update_dense_parameter_all_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy)
is_sparse=False, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=False)
def test_update_sparse_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy)
is_sparse=True, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=True, build_strategy=build_strategy, use_cuda=False)
def test_update_dense_parameter_reduce(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy)
is_sparse=False, build_strategy=build_strategy, use_cuda=True)
self.check_network_convergence(
is_sparse=False, build_strategy=build_strategy, use_cuda=False)
if __name__ == '__main__':
......
......@@ -18,6 +18,7 @@ import paddle.fluid as fluid
import unittest
import numpy as np
import paddle
import os
def Lenet(data, class_dim):
......@@ -35,7 +36,7 @@ def Lenet(data, class_dim):
class TestFetchOp(unittest.TestCase):
def parallel_exe(self, train_inputs, seed):
def parallel_exe(self, train_inputs, seed, use_cuda):
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = seed
......@@ -59,13 +60,13 @@ class TestFetchOp(unittest.TestCase):
# conv2d_1.b_0@GRAD. Those variables should not be pruned.
# fluid.memory_optimize(main)
place = fluid.CUDAPlace(0)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
pe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
use_cuda=use_cuda, loss_name=loss.name, main_program=main)
fetch_list = []
all_vars = main.global_block().vars
......@@ -88,14 +89,16 @@ class TestFetchOp(unittest.TestCase):
for i in range(iters):
train_inputs.append(tst_reader_iter.next())
self.parallel_exe(train_inputs, seed=1)
os.environ['CPU_NUM'] = str(4)
self.parallel_exe(train_inputs, seed=1, use_cuda=True)
self.parallel_exe(train_inputs, seed=1, use_cuda=False)
class TestFeedParallel(unittest.TestCase):
def test_main(self):
def parallel_exe(self, use_cuda, seed):
main = fluid.Program()
startup = fluid.Program()
startup.random_seed = 1
startup.random_seed = seed
with fluid.scope_guard(fluid.core.Scope()):
with fluid.program_guard(main, startup):
data = fluid.layers.data(
......@@ -111,15 +114,18 @@ class TestFeedParallel(unittest.TestCase):
regularization=fluid.regularizer.L2Decay(1e-4))
opt.minimize(loss)
place = fluid.CUDAPlace(0)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(
flowers.train(), batch_size=16), multi_devices=True)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
use_cuda=use_cuda, loss_name=loss.name, main_program=main)
for batch_id, data in enumerate(reader()):
loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
......@@ -127,6 +133,11 @@ class TestFeedParallel(unittest.TestCase):
if batch_id == 2:
break
def test_feed_op(self):
os.environ['CPU_NUM'] = str(4)
self.parallel_exe(use_cuda=True, seed=1)
self.parallel_exe(use_cuda=False, seed=1)
if __name__ == '__main__':
unittest.main()
......@@ -18,6 +18,7 @@ import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
......@@ -85,6 +86,7 @@ def fc_with_batchnorm(use_feed):
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
......@@ -99,9 +101,12 @@ class TestMNIST(TestParallelExecutorBase):
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True)
def check_simple_fc_convergence(self,
balance_parameter_opt_between_cards,
use_cuda=True):
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
......@@ -109,17 +114,21 @@ class TestMNIST(TestParallelExecutorBase):
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_simple_fc(self):
self.check_simple_fc_convergence(False)
self.check_simple_fc_convergence(False, use_cuda=True)
self.check_simple_fc_convergence(False, use_cuda=False)
def test_simple_fc_with_new_strategy(self):
self.check_simple_fc_convergence(True)
self.check_simple_fc_convergence(True, use_cuda=True)
self.check_simple_fc_convergence(True, use_cuda=False)
def check_simple_fc_parallel_accuracy(self,
balance_parameter_opt_between_cards):
balance_parameter_opt_between_cards,
use_cuda=True):
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
single_first_loss, single_last_loss = self.check_network_convergence(
......@@ -127,12 +136,14 @@ class TestMNIST(TestParallelExecutorBase):
seed=1000,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=True,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
......@@ -143,28 +154,33 @@ class TestMNIST(TestParallelExecutorBase):
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(False)
self.check_simple_fc_parallel_accuracy(False, use_cuda=True)
self.check_simple_fc_parallel_accuracy(False, use_cuda=False)
def test_simple_fc_parallel_accuracy_with_new_strategy(self):
self.check_simple_fc_parallel_accuracy(True)
self.check_simple_fc_parallel_accuracy(True, use_cuda=True)
self.check_simple_fc_parallel_accuracy(True, use_cuda=False)
def check_batchnorm_fc_convergence(self,
balance_parameter_opt_between_cards):
self.check_network_convergence(fc_with_batchnorm)
def check_batchnorm_fc_convergence(
self, balance_parameter_opt_between_cards, use_cuda):
self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence(False)
self.check_batchnorm_fc_convergence(False, use_cuda=True)
self.check_batchnorm_fc_convergence(False, use_cuda=False)
def test_batchnorm_fc_with_new_strategy(self):
self.check_batchnorm_fc_convergence(True)
self.check_batchnorm_fc_convergence(True, use_cuda=True)
self.check_batchnorm_fc_convergence(True, use_cuda=False)
if __name__ == '__main__':
......
......@@ -15,6 +15,7 @@
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
import unittest
import os
def squeeze_excitation(input, num_channels, reduction_ratio):
......@@ -130,22 +131,30 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
class TestResnet(TestParallelExecutorBase):
def check_resnet_convergence(self, balance_parameter_opt_between_cards):
def check_resnet_convergence(self,
balance_parameter_opt_between_cards,
use_cuda=True,
iter=20):
os.environ['CPU_NUM'] = str(4)
import functools
batch_size = 2
self.check_network_convergence(
functools.partial(
SE_ResNeXt50Small, batch_size=batch_size),
iter=20,
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
)
def test_resnet(self):
self.check_resnet_convergence(False)
self.check_resnet_convergence(False, use_cuda=True)
self.check_resnet_convergence(False, use_cuda=False, iter=5)
def test_resnet_with_new_strategy(self):
self.check_resnet_convergence(True)
self.check_resnet_convergence(True, use_cuda=True)
self.check_resnet_convergence(True, use_cuda=False, iter=5)
if __name__ == '__main__':
......
......@@ -15,6 +15,7 @@
import paddle.fluid as fluid
import numpy as np
import unittest
import os
def simple_fc_net():
......@@ -35,7 +36,8 @@ def simple_fc_net():
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def check_network_convergence(self, build_strategy=None):
def check_network_convergence(self, use_cuda, build_strategy=None):
os.environ['CPU_NUM'] = str(4)
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
......@@ -49,19 +51,19 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True,
use_cuda=use_cuda,
loss_name=loss.name,
main_program=main,
build_strategy=build_strategy)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
use_cuda=use_cuda,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy)
......@@ -81,12 +83,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(build_strategy)
self.check_network_convergence(
use_cuda=True, build_strategy=build_strategy)
self.check_network_convergence(
use_cuda=False, build_strategy=build_strategy)
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(build_strategy)
self.check_network_convergence(
use_cuda=True, build_strategy=build_strategy)
self.check_network_convergence(
use_cuda=False, build_strategy=build_strategy)
if __name__ == '__main__':
......
......@@ -19,6 +19,7 @@ from parallel_executor_test_base import TestParallelExecutorBase
import unittest
import paddle
import paddle.dataset.wmt16 as wmt16
import os
WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
......@@ -149,6 +150,7 @@ def transformer(use_feed):
class TestTransformer(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
......@@ -167,7 +169,8 @@ class TestTransformer(TestParallelExecutorBase):
@unittest.skip("transformer is buggy in multi gpu")
def test_main(self):
self.check_network_convergence(transformer)
self.check_network_convergence(transformer, use_cuda=True)
self.check_network_convergence(transformer, use_cuda=False)
if __name__ == '__main__':
......
......@@ -59,9 +59,9 @@ class TestSimpleDistTranspiler(TranspilerTest):
delete_ops(trainer.global_block(), optimize_ops)
ops = [op.type for op in trainer.global_block().ops] + [
"send_vars", "send_barrier", "recv", "recv", "fetch_barrier"
"send", "send_barrier", "recv", "recv", "fetch_barrier"
]
ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
ops.insert(ops.index("elementwise_add_grad") + 1, "send")
return ops
def _transpiler_instance(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSliceOp(OpTest):
def setUp(self):
self.op_type = "slice"
self.config()
self.inputs = {'Input': self.input}
self.outputs = {'Out': self.out}
self.attrs = {
'axes': self.axes,
'starts': self.starts,
'ends': self.ends
}
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [1, 0, 2]
self.ends = [3, 3, 4]
self.axes = [0, 1, 2]
self.out = self.input[1:3, 0:3, 2:4, :]
def test_check_output(self):
self.check_output()
class TestCase1(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 2]
self.out = self.input[-3:3, 0:100, 2:-1, :]
class TestCase2(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 3]
self.out = self.input[-3:3, 0:100, :, 2:-1]
if __name__ == '__main__':
unittest.main()
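The cases above exercise how the slice op's `axes`/`starts`/`ends` attributes map onto Python-style slicing: every axis not listed in `axes` is kept whole, and negative or out-of-range bounds behave as they do in NumPy. A hypothetical reference helper that mirrors this interpretation (not part of the test file itself):

```python
import numpy as np

def slice_ref(x, axes, starts, ends):
    # Start from a full slice on every dimension, then override the listed axes.
    idx = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        idx[axis] = slice(start, end)
    return x[tuple(idx)]

x = np.random.random([3, 4, 5, 6]).astype("float32")
out = slice_ref(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
assert np.array_equal(out, x[1:3, 0:3, 2:4, :])
```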
......@@ -27,11 +27,8 @@ import parallel_executor
from transpiler import distribute_transpiler
__all__ = [
'Trainer',
'BeginEpochEvent',
'EndEpochEvent',
'BeginStepEvent',
'EndStepEvent',
'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
'EndStepEvent', 'CheckpointConfig'
]
......@@ -59,6 +56,35 @@ class EndStepEvent(object):
self.metrics = metrics
class CheckpointConfig(object):
def __init__(self,
checkpoint_dir=None,
max_num_checkpoints=3,
epoch_interval=1,
step_interval=10):
if checkpoint_dir is None:
self.checkpoint_dir = os.getcwd()
else:
self.checkpoint_dir = checkpoint_dir
self.max_num_checkpoints = max_num_checkpoints
if epoch_interval < 1:
self.epoch_interval = 1
else:
self.epoch_interval = epoch_interval
if step_interval < 1:
self.step_interval = 10
else:
self.step_interval = step_interval
self.epoch_id = 0
self.step_id = 0
self.load_serial = None
self.is_pserver = False
def check_and_get_place(place):
"""
Check the type of place or get the default place
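The new `CheckpointConfig` is intended to be handed to `Trainer` through the `checkpoint_config` argument added below; the trainer then looks up the latest checkpoint serial in `checkpoint_dir` and resumes from the recorded `epoch_id`/`step_id`. A hedged usage sketch, assuming `CheckpointConfig` and `Trainer` are re-exported at the `fluid` level as the updated `__all__` suggests (the placeholder network, path, and hyperparameters are illustrative):

```python
import paddle.fluid as fluid

def train_program():
    # Placeholder network; the real model is whatever the user normally
    # passes to Trainer as train_func.
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    return fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))

def optimizer_program():
    return fluid.optimizer.SGD(learning_rate=0.01)

config = fluid.CheckpointConfig(
    checkpoint_dir="/tmp/ckpt_dir",   # illustrative path
    max_num_checkpoints=3,
    epoch_interval=1,
    step_interval=10)

trainer = fluid.Trainer(
    train_func=train_program,
    optimizer_func=optimizer_program,
    place=fluid.CPUPlace(),
    checkpoint_config=config)
# trainer.train(...) would then save a checkpoint every step_interval steps
# and, on restart, resume from the latest serial found in checkpoint_dir.
```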
......@@ -99,13 +125,24 @@ class Trainer(object):
optimizer_func,
param_path=None,
place=None,
parallel=False):
parallel=False,
checkpoint_config=None):
self.__stop = False
self.parallel = parallel
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
# config for checkpoint
# only chief worker will save variables
self.trainer_id = 0
self.checkpoint_cfg = checkpoint_config
if self.checkpoint_cfg:
assert isinstance(self.checkpoint_cfg, CheckpointConfig)
serial = io.get_latest_checkpoint_serial(
self.checkpoint_cfg.checkpoint_dir)
self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
self.scope = core.Scope()
self.startup_program = framework.Program()
......@@ -115,9 +152,9 @@ class Trainer(object):
program_func_outs = train_func()
self.train_func_outputs = program_func_outs if isinstance(
program_func_outs, list) else [program_func_outs]
self.test_program = self.train_program.clone()
self.test_program = self.train_program.clone(for_test=True)
# The fisrt element of program_func_outs is loss.
# The first element of program_func_outs is loss.
loss = self.train_func_outputs[0]
optimizer = optimizer_func()
......@@ -137,9 +174,25 @@ class Trainer(object):
exe = executor.Executor(place)
exe.run(self.startup_program)
if param_path:
if self.checkpoint_cfg and self.checkpoint_cfg.load_serial:
with self._prog_and_scope_guard():
exe = executor.Executor(place)
io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
self.checkpoint_cfg.load_serial,
self.startup_program)
if not self.checkpoint_cfg.is_pserver:
epoch_id, step_id = io.load_trainer_args(
self.checkpoint_cfg.checkpoint_dir,
self.checkpoint_cfg.load_serial, self.trainer_id,
self._get_checkpoint_load_args())
self.checkpoint_cfg.epoch_id = int(epoch_id)
self.checkpoint_cfg.step_id = int(step_id)
if param_path and os.path.isdir(param_path):
# load params from param_path into scope
io.load_persistables(exe, dirname=param_path)
io.load_persist_vars_without_grad(
exe, dirname=param_path, program=self.startup_program)
def _transpile_nccl2_dist(self):
# PADDLE_TRAINER_IPS
......@@ -194,14 +247,18 @@ class Trainer(object):
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
with self._prog_and_scope_guard():
t = distribute_transpiler.DistributeTranspiler()
t.transpile(
trainer_id, pservers=pserver_endpoints, trainers=trainers)
self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER":
if self.checkpoint_cfg:
self.is_pserver = True
self.train_program = t.get_pserver_program(current_endpoint)
self.startup_program = t.get_startup_program(current_endpoint,
self.train_program)
......@@ -294,11 +351,26 @@ class Trainer(object):
self._train_by_any_executor(event_handler, exe, num_epochs, reader)
def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
for epoch_id in range(num_epochs):
if self.checkpoint_cfg:
epochs = [
epoch_id for epoch_id in range(num_epochs)
if epoch_id >= self.checkpoint_cfg.epoch_id
]
else:
epochs = [epoch_id for epoch_id in range(num_epochs)]
for epoch_id in epochs:
event_handler(BeginEpochEvent(epoch_id))
for step_id, data in enumerate(reader()):
if self.__stop:
if self.checkpoint_cfg:
self._clean_checkpoint()
return
if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \
and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id:
continue
begin_event = BeginStepEvent(epoch_id, step_id)
event_handler(begin_event)
if begin_event.fetch_metrics:
......@@ -309,8 +381,13 @@ class Trainer(object):
])
else:
metrics = exe.run(feed=data, fetch_list=[])
if self.checkpoint_cfg:
self._save_checkpoint(epoch_id, step_id)
event_handler(EndStepEvent(epoch_id, step_id, metrics))
event_handler(EndEpochEvent(epoch_id))
if self.checkpoint_cfg:
self._clean_checkpoint()
def _test_by_executor(self, reader, feed_order, fetch_list):
with executor.scope_guard(self.scope):
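The resume logic added above filters out epochs that precede the checkpointed `epoch_id` and, within the checkpointed epoch, skips steps up to the recorded `step_id` before training continues normally. A plain-Python sketch of that filtering with made-up values:

```python
num_epochs = 5
ckpt_epoch_id, ckpt_step_id = 2, 30   # values restored from the checkpoint

# Epochs before the checkpointed one are dropped outright.
epochs = [e for e in range(num_epochs) if e >= ckpt_epoch_id]   # [2, 3, 4]

def should_skip(epoch_id, step_id):
    # Within the checkpointed epoch, already-covered steps are skipped.
    return epoch_id == ckpt_epoch_id and step_id <= ckpt_step_id

print(epochs)                 # [2, 3, 4]
print(should_skip(2, 10))     # True  - already trained before the checkpoint
print(should_skip(2, 31))     # False - first new step of the resumed epoch
print(should_skip(3, 0))      # False - later epochs run in full
```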
......@@ -349,6 +426,38 @@ class Trainer(object):
loss_name=self.train_func_outputs[0].name)
return self._get_parallel_executor()
def _clean_checkpoint(self):
assert self.checkpoint_cfg
io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
def _get_checkpoint_load_args(self):
"""
epoch_id and step_id are runtime arguments, they are not variables, will load them independently.
"""
return ["epoch_id", "step_id"]
def _get_checkpoint_save_args(self, epoch_id, step_id):
"""
epoch_id and step_id are runtime arguments, they are not variables, will save them independently.
"""
trainer_args = {}
trainer_args["epoch_id"] = epoch_id
trainer_args["step_id"] = step_id
return trainer_args
def _save_checkpoint(self, epoch_id, step_id):
assert self.checkpoint_cfg
if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0:
exe = executor.Executor(self.place)
io.save_checkpoint(
executor=exe,
checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
trainer_id=self.trainer_id,
trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
main_program=self.train_program,
max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
def build_feed_var_list(program, feed_order):
if not isinstance(program, framework.Program):
......
......@@ -24,9 +24,9 @@ Steps to transpile trainer:
1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
3. modify trainer program add split_op to each grad variable.
4. append send_op to send splited variables to server and fetch
params(splited blocks or origin param) from server.
5. append concat_op to merge splited blocks to update local weights.
4. append send_op to send splited variables to server and
5. add recv_op to fetch params(splited blocks or origin param) from server.
6. append concat_op to merge splited blocks to update local weights.
Steps to transpile pserver:
1. create new program for parameter server.
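The trainer-side steps listed above are performed by `DistributeTranspiler.transpile()`; afterwards each process fetches either the pserver program or the trainer program depending on its role. A hedged sketch of the typical call sequence, assuming `DistributeTranspiler` is importable at the `fluid` level (endpoints, role handling, and the tiny placeholder network are illustrative):

```python
import os
import paddle.fluid as fluid

# A placeholder network so the transpiler has parameters and gradients to split.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
loss = fluid.layers.mean(
    fluid.layers.square_error_cost(fluid.layers.fc(input=x, size=1), y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

pserver_endpoints = "127.0.0.1:6170,127.0.0.1:6171"   # illustrative endpoints
current_endpoint = "127.0.0.1:6170"
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=2)

if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
    pserver_prog = t.get_pserver_program(current_endpoint)
    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
else:
    trainer_prog = t.get_trainer_program()
```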
......@@ -177,6 +177,7 @@ class DistributeTranspiler:
dtype=table_grad_var.dtype)
for index in range(len(self.pserver_endpoints))
]
return param_list, grad_list
def _init_splited_vars(self, slice_var_up):
# update these mappings for further transpile:
......@@ -199,8 +200,8 @@ class DistributeTranspiler:
grad_list.append(g)
param_grad_set.add(g.name)
self._update_dist_lookup_table_vars(param_list, grad_list,
self.params_grads)
param_list, grad_list = self._update_dist_lookup_table_vars(
param_list, grad_list, self.params_grads)
if slice_var_up:
# when we slice var up into blocks, we will slice the var according to
......@@ -316,7 +317,7 @@ class DistributeTranspiler:
program.global_block().insert_op(
index=index + 1,
type="send_vars",
type="send",
inputs={"X": splited_vars},
outputs={},
attrs={
......@@ -677,7 +678,7 @@ class DistributeTranspiler:
break
def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
# 2. add split_ids_op and send_vars_op to send gradient to pservers
# 2. add split_ids_op and send_op to send gradient to pservers
# there should only be one table_name
all_ops = program.global_block().ops
table_grad_name = grad_var_name(self.table_name)
......@@ -694,11 +695,11 @@ class DistributeTranspiler:
outputs={"Out": self.trainer_side_table_grad_list})
program.global_block().insert_op(
index=op_index + 2,
type="send_vars",
type="send",
inputs={'X': self.trainer_side_table_grad_list},
outputs={},
attrs={
"sync_send": True,
"sync_mode": True,
"epmap": pserver_endpoints,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
......
......@@ -119,7 +119,8 @@ def reader_creator(data_file,
yield sample, int(label) - 1
if use_xmap:
return xmap_readers(mapper, reader, cpu_count(), buffered_size)
cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
return xmap_readers(mapper, reader, cpu_num, buffered_size)
else:
return map_readers(mapper, reader)
......
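With this change the flowers reader caps its prefetch threads at `CPU_NUM` when that environment variable is set, instead of always spawning one thread per core. A small sketch of the lookup (the chosen value is only an example):

```python
import os
from multiprocessing import cpu_count

# Without CPU_NUM the reader uses every available core; when it is set,
# the environment value wins.
os.environ['CPU_NUM'] = '4'
cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
print(cpu_num)  # 4, regardless of how many cores the machine actually has
```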