diff --git a/.gitignore b/.gitignore index 9e3a0b499f9f42856429f3a42bef313ea3df3699..b92bb9cc129659fa502b4a9b55548992412e5429 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ *.DS_Store +*.vs build/ build_doc/ *.user @@ -15,6 +16,7 @@ build_doc/ .cproject .pydevproject .settings/ +CMakeSettings.json Makefile .test_env/ third_party/ diff --git a/.travis.yml b/.travis.yml index 8c772030925dcad3909f142b08e4d8057a3f89b7..361136ac2c8d899a0d7a4d7945083fcc489551b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,15 +27,6 @@ script: # 43min timeout paddle/scripts/paddle_docker_build.sh ${JOB} if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; - - | - if [[ "$JOB" != "doc" ]]; then exit 0; fi; - # For document only - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; - if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; - export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh - export DOCS_DIR=`pwd` - cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/ notifications: email: on_success: change diff --git a/AUTHORS.md b/AUTHORS.md index 8c4a113fc276783c945867ceae9612339b7f0bbc..41b7193677a0208ba2fa82b72862292572dcb6ef 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -46,6 +46,7 @@ | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | +| velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | diff --git a/CMakeLists.txt b/CMakeLists.txt index 23bb27e77b9eab0c322a71a8ff570d12d1050377..c2fa5420e916fd5958f6198d6e97c9b1092b5aa1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,9 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") +if(WIN32) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) +endif(WIN32) if(NOT CMAKE_CROSSCOMPILING) find_package(CUDA QUIET) @@ -65,7 +68,15 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(WITH_INFERENCE "Compile fluid inference library" ON) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) + +# PY_VERSION +if(NOT PY_VERSION) + set(PY_VERSION 2.7) +endif() +set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -103,6 +114,11 @@ if(ANDROID OR IOS) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() +if (APPLE OR WIN32) + set(WITH_MKL OFF CACHE STRING + "Disable MKL for building on mac and windows" FORCE) +endif() + set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") @@ -141,6 +157,8 @@ endif() ######################################################################################## include(external/mklml) # download mklml 
package +include(external/xbyak) # download xbyak package +include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -150,12 +168,20 @@ include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig -include(external/warpctc) # download, build, install warpctc include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) +include(external/cub) + +if (NOT WIN32) +# there is no official support of snappystream, warpctc, nccl, cupti in windows +include(external/snappy) # download snappy +include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc +include(cupti) +endif (NOT WIN32) if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -178,19 +204,27 @@ if(WITH_BRPC_RDMA) endif() endif() -include(external/snappy) # download snappy -include(external/snappystream) -include(external/threadpool) +include(external/threadpool) +include(flags) # set paddle compile flags include(cudnn) # set cudnn libraries, must before configure -include(cupti) include(configure) # add paddle env configuration + +if(WITH_GPU) + include(cuda) + include(tensorrt) +endif() +if(WITH_MKL OR WITH_MKLML) + include(external/anakin) +elseif() + set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries -include(flags) # set paddle compile flags include(version) # set PADDLE_VERSION include(coveralls) # set code coverage include(inference_lib) # add paddle fluid inference libraries @@ -210,14 +244,6 @@ set(EXTERNAL_LIBS ${PYTHON_LIBRARIES} ) -if(WITH_GPU) - include(cuda) - include(tensorrt) - include(external/anakin) -else() - set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) -endif() - if(WITH_AMD_GPU) find_package(HIP) include(hip) @@ -227,6 +253,10 @@ if(WITH_MKLML) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) endif() +if(WITH_LIBXSMM) + list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS}) +endif() + if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() @@ -266,7 +296,3 @@ if(WITH_DOC) find_python_module(recommonmark REQUIRED) add_subdirectory(doc) endif() - -if (WITH_CONTRIB) - add_subdirectory(paddle/contrib) -endif() diff --git a/Dockerfile b/Dockerfile index 48c750358cfcb227667c429f19befcaa2f51ebbd..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. 
-RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ tar -xz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr @@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ RUN pip install -r /root/requirements.txt diff --git a/README.md b/README.md index eb99ed21d02650ef16cc7da91836909c02895be9..45186ec4ef48dc305b2616dbf4966f01c3609962 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,21 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid) + +### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0) +### Install Latest Stable Release: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==0.14.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==0.14.0.post85 + +# For installation on other platform, refer to http://paddlepaddle.org/ +``` ## Features @@ -62,33 +76,26 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl ## Installation -It is recommended to check out the -[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html) -before looking into the -[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html). +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website. ## Documentation -We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and -[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation. -- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html) +- [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html) - - You can also run distributed training jobs on Kubernetes clusters. 
- -- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index 707fadb1fae97cefe8a41715cd57d71754abda41..2e1e0d376899fd664866621263db62258e7c3869 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ... + RUN pip install -U pip RUN pip install -U kubernetes paddlepaddle @@ -27,5 +28,6 @@ ADD *.whl / RUN pip install /*.whl && rm -f /*.whl ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/ +ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/ ADD models/ /workspace/models/ + diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index a79f25ccc6ace1594f3f331633130eaace5e175b..0d5c9652de6b814627e54018366137e214726619 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -17,7 +17,8 @@ import argparse __all__ = ['parse_args', ] BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" + "machine_translation", "resnet", "se_resnext", "vgg", "mnist", + "stacked_dynamic_lstm", "resnet_with_preprocess" ] @@ -67,12 +68,12 @@ def parse_args(): '--cpus', type=int, default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') + help='If cpus > 1, will set ParallelExecutor to use multiple threads.') parser.add_argument( '--data_set', type=str, default='flowers', - choices=['cifar10', 'flowers'], + choices=['cifar10', 'flowers', 'imagenet'], help='Optional dataset for benchmark.') parser.add_argument( '--infer_only', action='store_true', help='If set, run forward only.') @@ -122,6 +123,11 @@ def parse_args(): type=str, default="", help='Directory that contains all the training recordio files.') + parser.add_argument( + '--test_data_path', + type=str, + default="", + help='Directory that contains all the test data (NOT recordio).') parser.add_argument( '--use_inference_transpiler', action='store_true', @@ -130,5 +136,15 @@ def parse_args(): '--no_random', action='store_true', help='If set, keep the random seed and do not shuffle the data.') + parser.add_argument( + '--use_lars', + action='store_true', + help='If set, use lars for optimizers, ONLY support resnet module.') + parser.add_argument( + '--reduce_strategy', + type=str, + choices=['reduce', 'all_reduce'], + default='all_reduce', + help='Specify the reduce strategy, can be reduce, all_reduce') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 94ea7bd6aca7c9595037a2dacc5e36d4c77827e7..ddd9fe809853a830ca676cc98f1819f683866def 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ 
-16,6 +16,7 @@ import argparse import cProfile import time import os +import traceback import numpy as np @@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler from args import * -def append_nccl2_prepare(trainer_id): +def append_nccl2_prepare(trainer_id, startup_prog): if trainer_id >= 0: # append gen_nccl_id at the end of startup program trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) @@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id): current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port worker_endpoints.remove(current_endpoint) - nccl_id_var = fluid.default_startup_program().global_block().create_var( + nccl_id_var = startup_prog.global_block().create_var( name="NCCLID", persistable=True, type=fluid.core.VarDesc.VarType.RAW) - fluid.default_startup_program().global_block().append_op( + startup_prog.global_block().append_op( type="gen_nccl_id", inputs={}, outputs={"NCCLID": nccl_id_var}, @@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id, args): +def dist_transpile(trainer_id, args, train_prog, startup_prog): if trainer_id < 0: return None, None @@ -80,137 +81,74 @@ def dist_transpile(trainer_id, args): # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") - t = distribute_transpiler.DistributeTranspiler() + config = distribute_transpiler.DistributeTranspilerConfig() + config.slice_var_up = not args.no_split_var + t = distribute_transpiler.DistributeTranspiler(config=config) t.transpile( trainer_id, + # NOTE: *MUST* use train_prog, for we are using with guard to + # generate different program for train and test. + program=train_prog, pservers=pserver_endpoints, trainers=trainers, sync_mode=not args.async_mode, - slice_var_up=not args.no_split_var) + startup_program=startup_prog) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, - pserver_program) + pserver_startup_program = t.get_startup_program( + current_endpoint, pserver_program, startup_program=startup_prog) return pserver_program, pserver_startup_program elif training_role == "TRAINER": train_program = t.get_trainer_program() - return train_program, fluid.default_startup_program() + return train_program, startup_prog else: raise ValueError( 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER' ) -def test(exe, inference_program, test_reader, feeder, batch_acc): - accuracy_evaluator = fluid.metrics.Accuracy() - for batch_id, data in enumerate(test_reader()): - acc = exe.run(inference_program, - feed=feeder.feed(data), - fetch_list=[batch_acc]) - accuracy_evaluator.update(value=np.array(acc), weight=len(data)) +def test_parallel(exe, test_args, args, test_prog, feeder): + acc_evaluators = [] + for i in xrange(len(test_args[2])): + acc_evaluators.append(fluid.metrics.Accuracy()) - return accuracy_evaluator.eval() - - -# TODO(wuyi): replace train, train_parallel, test functions with new trainer -# API once it is ready. 
-def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, - args, train_prog, startup_prog): - if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": - place = core.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - exe.run(train_prog) - return - - if args.use_fake_data: - raise Exception( - "fake data is not supported in single GPU test for now.") - - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup_prog) - - # Use inference_transpiler to speedup - if not args.use_reader_op: - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) - - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.pass_num): - train_losses = [] - if not args.use_reader_op: - reader_generator = train_reader() - batch_id = 0 - data = None + to_fetch = [v.name for v in test_args[2]] + if args.use_reader_op: + test_args[4].start() while True: - if not args.use_reader_op: - data = next(reader_generator, None) - if data == None: - break - if iters == args.iterations: + try: + acc_rets = exe.run(fetch_list=to_fetch) + for i, e in enumerate(acc_evaluators): + e.update( + value=np.array(acc_rets[i]), weight=args.batch_size) + except fluid.core.EOFException as eof: + test_args[4].reset() break - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 + else: + for batch_id, data in enumerate(test_args[3]()): + acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch) + for i, e in enumerate(acc_evaluators): + e.update(value=np.array(acc_rets[i]), weight=len(data)) - if args.use_reader_op: - try: - loss = exe.run(train_prog, fetch_list=[avg_loss]) - except fluid.core.EnforceNotMet as ex: - break - else: - loss = exe.run(train_prog, - feed=feeder.feed(data), - fetch_list=[avg_loss]) - iters += 1 - batch_id += 1 - # FIXME(wuyi): For use_reader_op, if the current - # pass is not the last, the last batch of this pass - # is also equal to args.batch_size. - if args.use_reader_op: - num_samples += args.batch_size * args.gpus - else: - num_samples += len(data) - train_losses.append(loss) - print("Pass: %d, Iter: %d, Loss: %f\n" % - (pass_id, iters, np.mean(train_losses))) - print_train_time(start_time, time.time(), num_samples) - print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), - # evaluation - if not args.no_test and batch_acc and not args.use_reader_op: - if args.use_inference_transpiler: - t = fluid.InferenceTranspiler() - t.transpile(infer_prog, place) - - pass_test_acc = test(exe, infer_prog, test_reader, feeder, - batch_acc) - print(", Test Accuracy: %f" % pass_test_acc) - print("\n") - # TODO(wuyi): add warmup passes to get better perf data. - exit(0) + return [e.eval() for e in acc_evaluators] -# TODO(wuyi): replace train, train_parallel, test functions with new trainer -# API once it is ready. 
-def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, - batch_acc, args, train_prog, startup_prog, nccl_id_var, - num_trainers, trainer_id): +# NOTE: only need to benchmark using parallelexe +def train_parallel(train_args, test_args, args, train_prog, test_prog, + startup_prog, nccl_id_var, num_trainers, trainer_id): + over_all_start = time.time() place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + feeder = None if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) - # generate fake: if args.use_fake_data: for var in feed_var_list: - v = startup_prog.global_block().clone_variable(var) + v = startup_prog.global_block()._clone_variable(var) var.persistable = True v.persistable = True @@ -230,62 +168,119 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, startup_exe = fluid.Executor(place) startup_exe.run(startup_prog) strategy = fluid.ExecutionStrategy() - strategy.num_threads = 1 + strategy.num_threads = args.cpus strategy.allow_op_delay = False + build_strategy = fluid.BuildStrategy() + if args.reduce_strategy == "reduce": + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.AllReduce + + avg_loss = train_args[0] + + if args.update_method == "pserver": + # parameter server mode distributed training, merge + # gradients on local server, do not initialize + # ParallelExecutor with multi server all-reduce mode. + num_trainers = 1 + trainer_id = 0 + exe = fluid.ParallelExecutor( True, avg_loss.name, + main_program=train_prog, exec_strategy=strategy, + build_strategy=build_strategy, num_trainers=num_trainers, trainer_id=trainer_id) + if not args.no_test: + if args.update_method == "pserver": + test_scope = None + else: + # NOTE: use an empty scope to avoid test exe using NCCLID + test_scope = fluid.Scope() + test_exe = fluid.ParallelExecutor( + True, main_program=test_prog, share_vars_from=exe) + for pass_id in range(args.pass_num): num_samples = 0 iters = 0 start_time = time.time() if not args.use_reader_op: - reader_generator = train_reader() + reader_generator = train_args[3]() #train_reader batch_id = 0 data = None + if args.use_reader_op: + train_args[4].start() while True: if not args.use_reader_op: data = next(reader_generator, None) if data == None: break + if args.profile and batch_id == 5: + profiler.start_profiler("All") + profiler.reset_profiler() + elif args.profile and batch_id == 10: + print("profiling total time: ", time.time() - start_time) + profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" % + (trainer_id, pass_id)) if iters == args.iterations: + reader_generator.close() break - if args.profile and pass_id == 0 and batch_id == 5: - profiler.start_profiler("All") - elif args.profile and pass_id == 0 and batch_id == 10: - profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id) if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 + fetch_list = [avg_loss.name] + acc_name_list = [v.name for v in train_args[2]] + fetch_list.extend(acc_name_list) + if args.use_fake_data or args.use_reader_op: try: - loss, = exe.run([avg_loss.name]) + + fetch_ret = exe.run(fetch_list) + except fluid.core.EOFException as eof: + break except fluid.core.EnforceNotMet as ex: + traceback.print_exc() break else: - loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) 
+ fetch_ret = exe.run(fetch_list, feed=feeder.feed(data)) if args.use_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) + iters += 1 if batch_id % 1 == 0: - print("Pass %d, batch %d, loss %s" % - (pass_id, batch_id, np.array(loss))) + fetched_data = [np.mean(np.array(d)) for d in fetch_ret] + print("Pass %d, batch %d, loss %s, accuracies: %s" % + (pass_id, batch_id, fetched_data[0], fetched_data[1:])) batch_id += 1 print_train_time(start_time, time.time(), num_samples) - if not args.no_test and batch_acc and not args.use_reader_op: - # we have not implement record io for test - # skip test when use args.use_reader_op - test_acc = test(startup_exe, infer_prog, test_reader, feeder, - batch_acc) - print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) + if args.use_reader_op: + train_args[4].reset() # reset reader handle + else: + del reader_generator + + if not args.no_test and test_args[2]: + test_feeder = None + if not args.use_reader_op: + test_feed_var_list = [ + var for var in test_prog.global_block().vars.itervalues() + if var.is_data + ] + test_feeder = fluid.DataFeeder(test_feed_var_list, place) + test_ret = test_parallel(test_exe, test_args, args, test_prog, + test_feeder) + print("Pass: %d, Test Accuracy: %s\n" % + (pass_id, [np.mean(np.array(v)) for v in test_ret])) + + print("total train time: ", time.time() - over_all_start) def print_arguments(args): @@ -327,44 +322,46 @@ def main(): if args.use_cprof: pr = cProfile.Profile() pr.enable() + model_def = __import__("models.%s" % args.model, fromlist=["models"]) - train_args = list(model_def.get_model(args)) - train_args.append(args) - # Run optimizer.minimize(avg_loss) - train_args[2].minimize(train_args[0]) - if args.memory_optimize: - fluid.memory_optimize(fluid.default_main_program()) + + train_prog = fluid.Program() + test_prog = fluid.Program() + startup_prog = fluid.Program() + + train_args = list(model_def.get_model(args, True, train_prog, startup_prog)) + test_args = list(model_def.get_model(args, False, test_prog, startup_prog)) + + all_args = [train_args, test_args, args] if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id, args) + train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog, + startup_prog) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") - train_args.extend([train_prog, startup_prog]) + all_args.extend([train_prog, test_prog, startup_prog]) if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER": - train_args.extend([nccl_id_var, num_trainers, trainer_id]) - train_parallel(*train_args) - train(*train_args) + all_args.extend([nccl_id_var, num_trainers, trainer_id]) + train_parallel(*all_args) + elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": + # start pserver with Executor + server_exe = fluid.Executor(fluid.CPUPlace()) + server_exe.run(startup_prog) + server_exe.run(train_prog) exit(0) # for other update methods, use default programs - train_args.append(fluid.default_main_program()) - train_args.append(fluid.default_startup_program()) + all_args.extend([train_prog, test_prog, startup_prog]) if args.update_method == "nccl2": - nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id) - if args.gpus == 1: - # NOTE: parallel executor use profiler interanlly - if args.use_nvprof and args.device == 'GPU': - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: - train(*train_args) - else: - train(*train_args) - else: - if
args.device == "CPU": - raise Exception("Only support GPU perf with parallel exe") - train_args.extend([nccl_id_var, num_trainers, trainer_id]) - train_parallel(*train_args) + nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare( + trainer_id, startup_prog) + + if args.device == "CPU": + raise Exception("Only support GPU perf with parallel exe") + all_args.extend([nccl_id_var, num_trainers, trainer_id]) + train_parallel(*all_args) if __name__ == "__main__": diff --git a/benchmark/fluid/imagenet_reader.py b/benchmark/fluid/imagenet_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..a39485a61f12417fbdb512fc81e90ec49c310bf5 --- /dev/null +++ b/benchmark/fluid/imagenet_reader.py @@ -0,0 +1,344 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import random +import functools +import numpy as np +from threading import Thread +import subprocess +import time + +from Queue import Queue +import paddle +from PIL import Image, ImageEnhance + +random.seed(0) + +DATA_DIM = 224 + +THREAD = int(os.getenv("PREPROCESS_THREADS", "10")) +BUF_SIZE = 5120 + +DATA_DIR = '/mnt/ImageNet' +TRAIN_LIST = '/mnt/ImageNet/train.txt' +TEST_LIST = '/mnt/ImageNet/val.txt' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = random.randint(0, width - size) + h_start = random.randint(0, height - size) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]): + aspect_ratio = math.sqrt(random.uniform(*ratio)) + w = 1. * aspect_ratio + h = 1. 
/ aspect_ratio + + bound = min((float(img.size[0]) / img.size[1]) / (w**2), + (float(img.size[1]) / img.size[0]) / (h**2)) + scale_max = min(scale[1], bound) + scale_min = min(scale[0], bound) + + target_area = img.size[0] * img.size[1] * random.uniform(scale_min, + scale_max) + target_size = math.sqrt(target_area) + w = int(target_size * w) + h = int(target_size * h) + + i = random.randint(0, img.size[0] - w) + j = random.randint(0, img.size[1] - h) + + img = img.crop((i, j, i + w, j + h)) + img = img.resize((size, size), Image.LANCZOS) + return img + + +def rotate_image(img): + angle = random.randint(-10, 10) + img = img.rotate(angle) + return img + + +def distort_color(img): + def random_brightness(img, lower=0.5, upper=1.5): + e = random.uniform(lower, upper) + return ImageEnhance.Brightness(img).enhance(e) + + def random_contrast(img, lower=0.5, upper=1.5): + e = random.uniform(lower, upper) + return ImageEnhance.Contrast(img).enhance(e) + + def random_color(img, lower=0.5, upper=1.5): + e = random.uniform(lower, upper) + return ImageEnhance.Color(img).enhance(e) + + ops = [random_brightness, random_contrast, random_color] + random.shuffle(ops) + + img = ops[0](img) + img = ops[1](img) + img = ops[2](img) + + return img + + +def process_image(sample, mode, color_jitter, rotate): + img_path = sample[0] + + img = Image.open(img_path) + if mode == 'train': + if rotate: img = rotate_image(img) + img = random_crop(img, DATA_DIM) + else: + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + if mode == 'train': + if color_jitter: + img = distort_color(img) + if random.randint(0, 1) == 1: + img = img.transpose(Image.FLIP_LEFT_RIGHT) + + if img.mode != 'RGB': + img = img.convert('RGB') + + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + + if mode == 'train' or mode == 'val': + return img, sample[1] + elif mode == 'test': + return [img] + + +class XmapEndSignal(): + pass + + +def xmap_readers(mapper, + reader, + process_num, + buffer_size, + order=False, + print_queue_state=True): + end = XmapEndSignal() + + # define a worker to read samples from reader to in_queue + def read_worker(reader, in_queue): + for i in reader(): + in_queue.put(i) + in_queue.put(end) + + # define a worker to read samples from reader to in_queue with order flag + def order_read_worker(reader, in_queue, file_queue): + in_order = 0 + for i in reader(): + in_queue.put((in_order, i)) + in_order += 1 + in_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue + def handle_worker(in_queue, out_queue, mapper): + sample = in_queue.get() + while not isinstance(sample, XmapEndSignal): + r = mapper(sample) + out_queue.put(r) + sample = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue by order + def order_handle_worker(in_queue, out_queue, mapper, out_order): + ins = in_queue.get() + while not isinstance(ins, XmapEndSignal): + order, sample = ins + r = mapper(sample) + while order != out_order[0]: + pass + out_queue.put(r) + out_order[0] += 1 + ins = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + def xreader(): + file_queue = Queue() + in_queue = Queue(buffer_size) + out_queue = Queue(buffer_size) + out_order = [0] + # start a read worker in a thread + target = order_read_worker if order else read_worker + t = Thread(target=target, 
args=(reader, in_queue)) + t.daemon = True + t.start() + # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) + workers = [] + for i in xrange(process_num): + worker = Thread(target=target, args=args) + worker.daemon = True + workers.append(worker) + for w in workers: + w.start() + + sample = out_queue.get() + start_t = time.time() + while not isinstance(sample, XmapEndSignal): + yield sample + sample = out_queue.get() + if time.time() - start_t > 3: + if print_queue_state: + print("queue sizes: ", in_queue.qsize(), out_queue.qsize()) + start_t = time.time() + finish = 1 + while finish < process_num: + sample = out_queue.get() + if isinstance(sample, XmapEndSignal): + finish += 1 + else: + yield sample + + return xreader + + +def _reader_creator(file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + xmap=True): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + random.shuffle(full_lines) + if mode == 'train': + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + trainer_count = int(os.getenv("PADDLE_TRAINERS")) + per_node_lines = len(full_lines) / trainer_count + lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) + * per_node_lines] + print( + "read images from %d, length: %d, lines length: %d, total: %d" + % (trainer_id * per_node_lines, per_node_lines, len(lines), + len(full_lines))) + else: + lines = full_lines + + for line in lines: + if mode == 'train': + img_path, label = line.split() + img_path = img_path.replace("JPEG", "jpeg") + img_path = os.path.join(DATA_DIR, "train", img_path) + yield (img_path, int(label)) + elif mode == 'val': + img_path, label = line.split() + img_path = img_path.replace("JPEG", "jpeg") + img_path = os.path.join(DATA_DIR, "val", img_path) + yield (img_path, int(label)) + elif mode == 'test': + img_path = os.path.join(DATA_DIR, line) + yield [img_path] + + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) + + return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) + + +def load_raw_image_uint8(sample): + img_arr = np.array(Image.open(sample[0])).astype('int64') + return img_arr, int(sample[1]) + + +def train_raw(file_list=TRAIN_LIST, shuffle=True): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + random.shuffle(full_lines) + + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + trainer_count = int(os.getenv("PADDLE_TRAINERS")) + per_node_lines = len(full_lines) / trainer_count + lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) * + per_node_lines] + print("read images from %d, length: %d, lines length: %d, total: %d" + % (trainer_id * per_node_lines, per_node_lines, len(lines), + len(full_lines))) + + for line in lines: + img_path, label = line.split() + img_path = img_path.replace("JPEG", "jpeg") + img_path = os.path.join(DATA_DIR, "train", img_path) + yield (img_path, int(label)) + + return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD, + BUF_SIZE) + + +def train(file_list=TRAIN_LIST, xmap=True): + return _reader_creator( + file_list, + 'train', + shuffle=True, + color_jitter=False, + rotate=False, + xmap=xmap) + + +def val(file_list=TEST_LIST, xmap=True): + return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap) + + +def test(file_list=TEST_LIST): + return 
_reader_creator(file_list, 'test', shuffle=False) + + +if __name__ == "__main__": + c = 0 + start_t = time.time() + for d in train()(): + c += 1 + if c >= 10000: + break + spent = time.time() - start_t + print("read 10000 speed: ", 10000 / spent, spent) diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py index dfe8b5cdd58456902fa8ec355e9837dface3f7be..c1f22f1bfa02dd409edc8e1c39a72524240f4088 100644 --- a/benchmark/fluid/kube_gen_job.py +++ b/benchmark/fluid/kube_gen_job.py @@ -163,6 +163,19 @@ def gen_job(): volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}}) volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"}) + # add ceph volumes + volumes.append({ + "name": "ceph-data", + "cephfs": { + "monitors": ["192.168.16.23:6789"], + "secretRef": { + "name": "ceph-secret" + }, + "user": "admin", + } + }) + volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"}) + tn["spec"]["template"]["spec"]["volumes"] = volumes tn_container["volumeMounts"] = volumeMounts diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py index 1c3fcac8dd4a1ba0496ef013bd4eb468a0075125..1b8f63c7070c2cd45531966b0bcdff95a848574d 100644 --- a/benchmark/fluid/models/__init__.py +++ b/benchmark/fluid/models/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. __all__ = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm", + "resnet_with_preprocess" ] diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 17f6b03826ae818a3671ea7f9355a8e8c04b50be..18163c35d65a28c046cfeb33f5b96c34a1a6a35a 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""seq2seq model for fluid.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor): return ndarray -def get_model(args): +def get_model(args, is_train, main_prog, startup_prog): if args.use_reader_op: raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 @@ -190,30 +191,27 @@ def get_model(args): dict_size = 30000 beam_size = 3 max_length = 250 - avg_cost, feeding_list = seq_to_seq_net( - embedding_dim, - encoder_size, - decoder_size, - dict_size, - dict_size, - False, - beam_size=beam_size, - max_length=max_length) - - # clone from default main program - inference_program = fluid.default_main_program().clone() - - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) - - train_batch_generator = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=args.batch_size * args.gpus) - test_batch_generator = paddle.batch( + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + avg_cost, feeding_list = seq_to_seq_net( + embedding_dim, + encoder_size, + decoder_size, + dict_size, + dict_size, + False, + beam_size=beam_size, + max_length=max_length) + if is_train: + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + batch_generator = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.test(dict_size), buf_size=1000), - batch_size=args.batch_size) + paddle.dataset.wmt14.train(dict_size) + if is_train else paddle.dataset.wmt14.test(dict_size), + buf_size=1000), + batch_size=args.batch_size * args.gpus) - return avg_cost, inference_program, optimizer, train_batch_generator, \ - test_batch_generator, None + return avg_cost, optimizer, [], batch_generator, None diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 8e740dc6896b7eeeb82170aa13d32987c4df5c48..f123e07fb711bd8ff67c1ecf5ec9a02c1e79eb1d 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -65,61 +65,53 @@ def cnn_model(data): return predict -def get_model(args): - if args.use_reader_op: - filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] - data_file = fluid.layers.open_files( - filenames=filelist, - shapes=[[-1, 1, 28, 28], (-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int64"], - thread_num=args.gpus, - pass_num=args.pass_num) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file, batch_size=args.batch_size)) - images, label = fluid.layers.read_file(data_file) - else: - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - if args.device == 'CPU' and args.cpus > 1: - places = fluid.layers.get_places(args.cpus) - pd = fluid.layers.ParallelDo(places) - with pd.do(): - predict = cnn_model(pd.read_input(images)) - label = pd.read_input(label) +def get_model(args, is_train, main_prog, startup_prog): + # NOTE: mnist is small, we don't implement data sharding yet. 
+ opt = None + data_file_handle = None + with fluid.program_guard(main_prog, startup_prog): + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) + for f in os.listdir(args.data_path) + ] + data_file_handle = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1, 1, 28, 28], (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=1, + pass_num=1) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file_handle, batch_size=args.batch_size)) + with fluid.unique_name.guard(): + if args.use_reader_op: + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + + predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) + # Evaluator batch_acc = fluid.layers.accuracy(input=predict, label=label) - - pd.write_output(avg_cost) - pd.write_output(batch_acc) - - avg_cost, batch_acc = pd() - avg_cost = fluid.layers.mean(avg_cost) - batch_acc = fluid.layers.mean(batch_acc) - else: - # Train program - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_acc = fluid.layers.accuracy(input=predict, label=label) - - # inference program - inference_program = fluid.default_main_program().clone() - - # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) + # Optimization + if is_train: + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + opt.minimize(avg_cost) + if args.memory_optimize: + fluid.memory_optimize(main_prog) # Reader - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=args.batch_size) - return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc + if is_train: + reader = paddle.dataset.mnist.train() + else: + reader = paddle.dataset.mnist.test() + batched_reader = paddle.batch( + reader, batch_size=args.batch_size * args.gpus) + return avg_cost, opt, [batch_acc], batched_reader, data_file_handle diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index d44a9c07d31cfae9d54ad5949b85c77e60eae258..1b3bfe659c7d97b58dc4121387d4db22266381c5 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -20,6 +20,7 @@ import functools import numpy as np import time import os +import math import cProfile, pstats, StringIO @@ -27,182 +28,215 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler -from recordio_converter import imagenet_train, imagenet_test - - -def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - conv1 = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv1, act=act) - - -def shortcut(input, ch_out, stride): - ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] - if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None) - else: - return input - - -def basicblock(input, ch_out, stride): - short = shortcut(input, ch_out, stride) - conv1 = conv_bn_layer(input, ch_out,
3, stride, 1) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def bottleneck(input, ch_out, stride): - short = shortcut(input, ch_out * 4, stride) - conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) - conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) - return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') +from imagenet_reader import train, val + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class ResNet(): + def __init__(self, layers=50, is_train=True): + self.params = train_parameters + self.layers = layers + self.is_train = is_train + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=pool, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm( + input=conv, act=act, is_test=not self.is_train) + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride) + else: + return input + def bottleneck_block(self, input, num_filters, stride): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) -def layer_warp(block_func, input, ch_out, count, stride): - res_out = block_func(input, ch_out, stride) - for i in range(1, count): - res_out = block_func(res_out, ch_out, 1) - return res_out + short = self.shortcut(input, num_filters * 4, stride) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') -def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 
3], bottleneck) - } - stages, block_func = cfg[depth] - conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) - pool1 = fluid.layers.pool2d( - input=conv1, pool_type='avg', pool_size=3, pool_stride=2) - res1 = layer_warp(block_func, pool1, 64, stages[0], 1) - res2 = layer_warp(block_func, res1, 128, stages[1], 2) - res3 = layer_warp(block_func, res2, 256, stages[2], 2) - res4 = layer_warp(block_func, res3, 512, stages[3], 2) - pool2 = fluid.layers.pool2d( - input=res4, - pool_size=7, - pool_type='avg', - pool_stride=1, - global_pooling=True) - out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - return out - - -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): - assert (depth - 2) % 6 == 0 - - n = (depth - 2) // 6 - - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - return out - - -def get_model(args): - model = resnet_cifar10 - if args.data_set == "cifar10": - class_dim = 10 - if args.data_format == 'NCHW': - dshape = [3, 32, 32] - else: - dshape = [32, 32, 3] - model = resnet_cifar10 - train_reader = paddle.dataset.cifar.train10() - test_reader = paddle.dataset.cifar.test10() - elif args.data_set == "flowers": +def _model_reader_dshape_classdim(args, is_train): + model = None + reader = None + if args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet - train_reader = paddle.dataset.flowers.train() - test_reader = paddle.dataset.flowers.test() + if is_train: + reader = paddle.dataset.flowers.train() + else: + reader = paddle.dataset.flowers.test() elif args.data_set == "imagenet": class_dim = 1000 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if not args.data_path: raise Exception( "Must specify --data_path when training with imagenet") - train_reader = imagenet_train(args.data_path) - test_reader = imagenet_test(args.data_path) - - if args.use_reader_op: - filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] - data_file = fluid.layers.open_files( - filenames=filelist, - shapes=[[-1] + dshape, (-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int64"], - thread_num=args.gpus, - pass_num=args.pass_num) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file, batch_size=args.batch_size)) - input, label = fluid.layers.read_file(data_file) - else: - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - if args.device == 'CPU' and args.cpus > 1: - places = fluid.layers.get_places(args.cpus) - pd = fluid.layers.ParallelDo(places) - with pd.do(): - predict = model(pd.read_input(input), class_dim) - label = pd.read_input(label) + if not args.use_reader_op: + if is_train: + reader = train() + else: + reader = val() + else: + if is_train: + reader = train(xmap=False) + else: + reader = val(xmap=False) + return reader, dshape, class_dim + + +def get_model(args, is_train, main_prog, startup_prog): + reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train) + + pyreader = None + trainer_count = 
int(os.getenv("PADDLE_TRAINERS")) + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + if args.use_reader_op: + pyreader = fluid.layers.py_reader( + capacity=args.batch_size * args.gpus, + shapes=([-1] + dshape, (-1, 1)), + dtypes=('float32', 'int64'), + name="train_reader" if is_train else "test_reader", + use_double_buffer=True) + input, label = fluid.layers.read_file(pyreader) + else: + input = fluid.layers.data( + name='data', shape=dshape, dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + + model = ResNet(is_train=is_train) + predict = model.net(input, class_dim=class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) - batch_acc = fluid.layers.accuracy(input=predict, label=label) - - pd.write_output(avg_cost) - pd.write_output(batch_acc) - avg_cost, batch_acc = pd() - avg_cost = fluid.layers.mean(avg_cost) - batch_acc = fluid.layers.mean(batch_acc) + batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1) + batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5) + + # configure optimize + optimizer = None + if is_train: + if args.use_lars: + lars_decay = 1.0 + else: + lars_decay = 0.0 + + total_images = 1281167 / trainer_count + + step = int(total_images / (args.batch_size * args.gpus) + 1) + epochs = [30, 60, 90] + bd = [step * e for e in epochs] + base_lr = args.learning_rate + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) + + if args.memory_optimize: + fluid.memory_optimize(main_prog) + + # config readers + if not args.use_reader_op: + batched_reader = paddle.batch( + reader if args.no_random else paddle.reader.shuffle( + reader, buf_size=5120), + batch_size=args.batch_size * args.gpus, + drop_last=True) else: - predict = model(input, class_dim) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - batch_acc = fluid.layers.accuracy(input=predict, label=label) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc]) - - optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - - batched_train_reader = paddle.batch( - train_reader if args.no_random else paddle.reader.shuffle( - train_reader, buf_size=5120), - batch_size=args.batch_size * args.gpus, - drop_last=True) - batched_test_reader = paddle.batch( - test_reader, batch_size=args.batch_size, drop_last=True) - - return avg_cost, inference_program, optimizer, batched_train_reader,\ - batched_test_reader, batch_acc + batched_reader = None + pyreader.decorate_paddle_reader( + paddle.batch( + reader if args.no_random else paddle.reader.shuffle( + reader, buf_size=5120), + batch_size=args.batch_size)) + + return avg_cost, optimizer, [batch_acc1, + batch_acc5], batched_reader, pyreader diff --git a/benchmark/fluid/models/resnet_with_preprocess.py b/benchmark/fluid/models/resnet_with_preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..e8d661d847516a15e4e28796960815935b82ae6f --- /dev/null +++ b/benchmark/fluid/models/resnet_with_preprocess.py @@ -0,0 +1,268 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np +import time +import os + +import cProfile, pstats, StringIO + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler +# from recordio_converter import imagenet_train, imagenet_test +from imagenet_reader import train_raw, val + + +def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + is_train=True): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train) + + +def shortcut(input, ch_out, stride, is_train=True): + ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer( + input, ch_out, 1, stride, 0, None, is_train=is_train) + else: + return input + + +def basicblock(input, ch_out, stride, is_train=True): + short = shortcut(input, ch_out, stride, is_train=is_train) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride, is_train=True): + short = shortcut(input, ch_out * 4, stride, is_train=is_train) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train) + conv3 = conv_bn_layer( + conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, + class_dim, + depth=50, + data_format='NCHW', + is_train=True): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert 
(depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def _model_reader_dshape_classdim(args, is_train): + model = resnet_cifar10 + reader = None + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + model = resnet_cifar10 + if is_train: + reader = paddle.dataset.cifar.train10() + else: + reader = paddle.dataset.cifar.test10() + elif args.data_set == "flowers": + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + if is_train: + reader = paddle.dataset.flowers.train() + else: + reader = paddle.dataset.flowers.test() + elif args.data_set == "imagenet": + class_dim = 1000 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + if not args.data_path: + raise Exception( + "Must specify --data_path when training with imagenet") + if not args.use_reader_op: + if is_train: + reader = train_raw() + else: + reader = val() + else: + if is_train: + reader = train_raw() + else: + reader = val(xmap=False) + return model, reader, dshape, class_dim + + +def get_model(args, is_train, main_prog, startup_prog): + model, reader, dshape, class_dim = _model_reader_dshape_classdim(args, + is_train) + + pyreader = None + trainer_count = int(os.getenv("PADDLE_TRAINERS")) + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + if args.use_reader_op: + pyreader = fluid.layers.py_reader( + capacity=args.batch_size * args.gpus, + shapes=([-1] + dshape, (-1, 1)), + dtypes=('uint8', 'int64'), + name="train_reader" if is_train else "test_reader", + use_double_buffer=True) + input, label = fluid.layers.read_file(pyreader) + else: + input = fluid.layers.data( + name='data', shape=dshape, dtype='uint8') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + + # add imagenet preprocessors + random_crop = fluid.layers.random_crop(input, dshape) + casted = fluid.layers.cast(random_crop, 'float32') + # input is HWC + trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0 + img_mean = fluid.layers.tensor.assign( + np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1, + 1))) + img_std = fluid.layers.tensor.assign( + np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1, + 1))) + h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1) + h2 = fluid.layers.elementwise_div(h1, img_std, axis=1) + + # pre_out = (trans - img_mean) / img_std + + predict = model(h2, class_dim, is_train=is_train) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1) + batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5) + + # configure optimize + optimizer = None + if is_train: + if args.use_lars: + lars_decay = 1.0 + else: + lars_decay = 0.0 + + total_images = 1281167 / trainer_count + + step = int(total_images / args.batch_size + 1) + epochs = [30, 60, 80, 90] + bd = [step * e for e in epochs] + base_lr = 
args.learning_rate + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.Momentum( + learning_rate=base_lr, + #learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) + + if args.memory_optimize: + fluid.memory_optimize(main_prog) + + # config readers + if not args.use_reader_op: + batched_reader = paddle.batch( + reader if args.no_random else paddle.reader.shuffle( + reader, buf_size=5120), + batch_size=args.batch_size * args.gpus, + drop_last=True) + else: + batched_reader = None + pyreader.decorate_paddle_reader( + paddle.batch( + # reader if args.no_random else paddle.reader.shuffle( + # reader, buf_size=5120), + reader, + batch_size=args.batch_size)) + + return avg_cost, optimizer, [batch_acc1, + batch_acc5], batched_reader, pyreader diff --git a/benchmark/fluid/models/se_resnext.py b/benchmark/fluid/models/se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..9f887fb324dc86a30b708b9ef04068282a3e6c3e --- /dev/null +++ b/benchmark/fluid/models/se_resnext.py @@ -0,0 +1,286 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
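+# NOTE: SE-ResNeXt combines ResNeXt bottleneck blocks (grouped 3x3 convolutions
+# with `cardinality` groups) with a squeeze-and-excitation gate that re-weights
+# the channels of each block.  A minimal sketch of the gate implemented in
+# squeeze_excitation() below, for an NCHW tensor `x` with C channels and
+# reduction ratio r (names here are illustrative only):
+#
+#   s = fluid.layers.pool2d(x, pool_type='avg', global_pooling=True)  # squeeze to [N, C, 1, 1]
+#   z = fluid.layers.fc(s, size=C // r, act='relu')                   # bottleneck "excitation"
+#   g = fluid.layers.fc(z, size=C, act='sigmoid')                     # per-channel gate in (0, 1)
+#   y = fluid.layers.elementwise_mul(x, g, axis=0)                    # rescale the channels of x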
+ +import paddle +import paddle.fluid as fluid +import math +import os +from imagenet_reader import train, val + +__all__ = [ + "SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d", + "SE_ResNeXt152_32x4d", "get_model" +] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class SE_ResNeXt(): + def __init__(self, layers=50, is_train=True): + self.params = train_parameters + self.layers = layers + self.is_train = is_train + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + conv = self.conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = self.conv_bn_layer( + input=conv, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return self.conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, cardinality, + reduction_ratio): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = self.squeeze_excitation( + input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = self.shortcut(input, 
num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm( + input=conv, act=act, is_test=not self.is_train) + + def squeeze_excitation(self, input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform( + -stdv, stdv))) + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform( + -stdv, stdv))) + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def SE_ResNeXt50_32x4d(): + model = SE_ResNeXt(layers=50) + return model + + +def SE_ResNeXt101_32x4d(): + model = SE_ResNeXt(layers=101) + return model + + +def SE_ResNeXt152_32x4d(): + model = SE_ResNeXt(layers=152) + return model + + +def get_model(args, is_train, main_prog, startup_prog): + model = SE_ResNeXt(layers=50) + batched_reader = None + pyreader = None + trainer_count = int(os.getenv("PADDLE_TRAINERS")) + dshape = train_parameters["input_size"] + + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + if args.use_reader_op: + pyreader = fluid.layers.py_reader( + capacity=10, + shapes=([-1] + dshape, (-1, 1)), + dtypes=('float32', 'int64'), + name="train_reader" if is_train else "test_reader", + use_double_buffer=True) + input, label = fluid.layers.read_file(pyreader) + else: + input = fluid.layers.data( + name='data', shape=dshape, dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + + out = model.net(input=input) + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + optimizer = None + if is_train: + if args.use_lars: + lars_decay = 1.0 + else: + lars_decay = 0.0 + + total_images = 1281167 / trainer_count + + step = int(total_images / args.batch_size + 1) + epochs = [40, 80, 100] + bd = [step * e for e in epochs] + base_lr = args.learning_rate + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.Momentum( + # learning_rate=base_lr, + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4), + LARS_weight_decay=lars_decay) + optimizer.minimize(avg_cost) + + if args.memory_optimize: + fluid.memory_optimize(main_prog) + + # config readers + if is_train: + reader = train() + else: + reader = val() + + if not args.use_reader_op: + batched_reader = paddle.batch( + reader, batch_size=args.batch_size * args.gpus, drop_last=True) + else: + pyreader.decorate_paddle_reader( + paddle.batch( + reader, batch_size=args.batch_size)) + + return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader diff --git 
a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 3231542a17ace99a17c9f9b9bdb3c2527637d9ef..f23bb59de9158b0481320cc409879b3b72cbd43e 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -26,7 +26,6 @@ import numpy import paddle import paddle.dataset.imdb as imdb import paddle.fluid as fluid -import paddle.batch as batch import paddle.fluid.profiler as profiler word_dict = imdb.word_dict() @@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size): return __impl__ -def get_model(args): - if args.use_reader_op: - raise Exception( - "stacked_dynamic_lstm do not support reader op for now.") - lstm_size = 512 - emb_dim = 512 - crop_size = 1500 - - data = fluid.layers.data( - name="words", shape=[1], lod_level=1, dtype='int64') - sentence = fluid.layers.embedding( - input=data, size=[len(word_dict), emb_dim]) - +def lstm_net(sentence, lstm_size): sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') rnn = fluid.layers.DynamicRNN() @@ -97,31 +84,47 @@ def get_model(args): last = fluid.layers.sequence_pool(rnn(), 'last') logit = fluid.layers.fc(input=last, size=2, act='softmax') - loss = fluid.layers.cross_entropy( - input=logit, - label=fluid.layers.data( - name='label', shape=[1], dtype='int64')) - loss = fluid.layers.mean(x=loss) + return logit - # add acc - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ - shape=[1], dtype='int64'), total=batch_size_tensor) - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) - - adam = fluid.optimizer.Adam() +def get_model(args, is_train, main_prog, startup_prog): + if args.use_reader_op: + raise Exception( + "stacked_dynamic_lstm do not support reader op for now.") + lstm_size = 512 + emb_dim = 512 + crop_size = 1500 - train_reader = batch( + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + data = fluid.layers.data( + name="words", shape=[1], lod_level=1, dtype='int64') + sentence = fluid.layers.embedding( + input=data, size=[len(word_dict), emb_dim]) + logit = lstm_net(sentence, lstm_size) + loss = fluid.layers.cross_entropy( + input=logit, + label=fluid.layers.data( + name='label', shape=[1], dtype='int64')) + loss = fluid.layers.mean(x=loss) + + # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ + shape=[1], dtype='int64'), total=batch_size_tensor) + + if is_train: + adam = fluid.optimizer.Adam() + adam.minimize(loss) + + if is_train: + reader = crop_sentence(imdb.train(word_dict), crop_size) + else: + reader = crop_sentence(imdb.test(word_dict), crop_size) + + batched_reader = paddle.batch( paddle.reader.shuffle( - crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), + reader, buf_size=25000), batch_size=args.batch_size * args.gpus) - test_reader = batch( - paddle.reader.shuffle( - crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) - return loss, inference_program, adam, train_reader, test_reader, batch_acc + return loss, adam, [batch_acc], batched_reader, None diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 
932601302d2f5d56b53e3462af886429034d8989..cf9708d500684465dc8ec1666bf269e7e1300f59 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -25,7 +25,7 @@ import functools import os -def vgg16_bn_drop(input): +def vgg16_bn_drop(input, is_train=True): def conv_block(input, num_filter, groups, dropouts): return fluid.nets.img_conv_group( input=input, @@ -46,13 +46,13 @@ def vgg16_bn_drop(input): drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') + bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train) drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=512, act=None) return fc2 -def get_model(args): +def get_model(args, is_train, main_prog, startup_prog): if args.data_set == "cifar10": classdim = 10 if args.data_format == 'NCHW': @@ -65,57 +65,56 @@ def get_model(args): data_shape = [3, 224, 224] else: data_shape = [224, 224, 3] + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + with fluid.program_guard(main_prog, startup_prog): + if args.use_reader_op: + data_file_handle = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + data_shape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=1, + pass_num=1) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file_handle, batch_size=args.batch_size)) + with fluid.unique_name.guard(): + if args.use_reader_op: + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data( + name='data', shape=data_shape, dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + # Train program + net = vgg16_bn_drop(images, is_train=is_train) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) - if args.use_reader_op: - filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] - data_file = fluid.layers.open_files( - filenames=filelist, - shapes=[[-1] + data_shape, (-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int64"], - thread_num=args.gpus, - pass_num=args.pass_num) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file, batch_size=args.batch_size)) - images, label = fluid.layers.read_file(data_file) - else: - images = fluid.layers.data( - name='data', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - net = vgg16_bn_drop(images) - predict = fluid.layers.fc(input=net, size=classdim, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - # inference program - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) - - # Optimization - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + # Optimization + if 
is_train: + optimizer = fluid.optimizer.Adam( + learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) # data reader - train_reader = paddle.batch( + if is_train: + reader = paddle.dataset.cifar.train10() \ + if args.data_set == 'cifar10' else paddle.dataset.flowers.train() + else: + reader = paddle.dataset.cifar.test10() \ + if args.data_set == 'cifar10' else paddle.dataset.flowers.test() + + batched_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), + reader, buf_size=5120), batch_size=args.batch_size * args.gpus) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc + return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755 --- a/benchmark/paddle/image/run.sh +++ b/benchmark/paddle/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755 --- a/benchmark/paddle/image/run_mkl_infer.sh +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755 --- a/benchmark/paddle/image/run_mkl_train.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index 935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755 --- a/benchmark/paddle/rnn/run.sh +++ b/benchmark/paddle/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755 --- a/benchmark/tensorflow/image/run.sh +++ b/benchmark/tensorflow/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755 --- a/benchmark/tensorflow/image/run_multi.sh +++ 
b/benchmark/tensorflow/image/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755 --- a/benchmark/tensorflow/rnn/run.sh +++ b/benchmark/tensorflow/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755 --- a/benchmark/tensorflow/rnn/run_multi.sh +++ b/benchmark/tensorflow/rnn/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e4af34d10ed92c501dd805addb62747c91c00978..ce1857582bd3e8ab3077158384beaae36a83a4b2 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -50,13 +50,22 @@ if(NOT WITH_PROFILER) endif(NOT WITH_PROFILER) if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) + if(WITH_AVX AND AVX512F_FOUND) + set(SIMD_FLAG ${AVX512F_FLAG}) + elseif(WITH_AVX AND AVX2_FOUND) + set(SIMD_FLAG ${AVX2_FLAG}) + elseif(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() endif() +if(WIN32) + # windows stupid compile option for all targets. + add_definitions(-D_XKEYCHECK_H) +endif(WIN32) + if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) @@ -97,6 +106,23 @@ if(WITH_GPU) endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() + if(WITH_ANAKIN) + if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) + message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF") + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE) + endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) + endif() + endif() + if(WITH_ANAKIN) + # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR + # is a softlink to real cudnn.h directory + set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/") + get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY) + set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR}) + endif() elseif(WITH_AMD_GPU) add_definitions(-DPADDLE_WITH_HIP) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index b520c03a836a9e3f263ba050f151877ffe0d071d..03c73786a6c31868b1893bfcb319e43e37db1a3d 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. +if (NOT WIN32) # windows msvc2015 support c++11 natively. +# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. 
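+# --use_fast_math is appended below, outside this guard: it is a device-side
+# nvcc option and should be accepted with the MSVC host compiler as well,
+# unlike -std=c++11 / -fPIC, which only gcc/clang understand.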
list(APPEND CUDA_NVCC_FLAGS "-std=c++11") -list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +endif(NOT WIN32) + +list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") # in cuda9, suppress cuda warning on eigen list(APPEND CUDA_NVCC_FLAGS "-w") # Set :expt-relaxed-constexpr to suppress Eigen warnings list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") +if (NOT WIN32) if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "Release") @@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") # nvcc 9 does not support -Os. Use Release flags instead list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() +else(NOT WIN32) +if(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") +else() + message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") +endif() +endif(NOT WIN32) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 2c84061ff572de4687b4d496f8ded6deee8d1011..cd51533926de7bb132ab7bfab1686d664a331410 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -21,11 +21,29 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib - /usr/lib) -find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ) +set(CUDNN_LIB_NAME "") +if (LINUX) +set(CUDNN_LIB_NAME "libcudnn.so") +endif(LINUX) + +if(WIN32) +# only support cudnn7 +set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") +endif(WIN32) + +if(Apple) +set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") +endif(Apple) + +find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} NO_DEFAULT_PATH DOC "Path to cuDNN library.") diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index d205e3958234cabfbfeba8c3d725fe618ce48ace..ed054ff41ae0ec5a4b31dd256e397129cba3e8f1 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -2,43 +2,74 @@ if (NOT WITH_ANAKIN) return() endif() -set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH - "Anakin install path." FORCE) -set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") -set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") - -set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp) - -set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") - -# A helper function used in Anakin, currently, to use it, one need to recursively include -# nearly all the header files. 
-function(fetch_include_recursively root_dir) - if (IS_DIRECTORY ${root_dir}) - include_directories(${root_dir}) - endif() - - file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) - foreach(sub ${ALL_SUB}) - if (IS_DIRECTORY ${root_dir}/${sub}) - fetch_include_recursively(${root_dir}/${sub}) - endif() - endforeach() -endfunction() - -if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") - # download library - message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") +option(ANAKIN_ENABLE_OP_TIMER "Get more detailed information with Anakin op time" OFF) +if(ANAKIN_ENABLE_OP_TIMER) + add_definitions(-DPADDLE_ANAKIN_ENABLE_OP_TIMER) endif() -if (WITH_ANAKIN) - message(STATUS "Anakin for inference is enabled") - message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - fetch_include_recursively(${ANAKIN_INCLUDE}) - link_directories(${ANAKIN_LIBRARY}) +INCLUDE(ExternalProject) +set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin) +# the anakin install dir is only default one now +set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output) +set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR}) +set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR}) +set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so) +set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so) + +include_directories(${ANAKIN_INCLUDE}) +include_directories(${ANAKIN_INCLUDE}/saber/) +include_directories(${ANAKIN_INCLUDE}/saber/core/) +include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/x86/) +include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/cuda/base/cuda_c/) + +set(ANAKIN_COMPILE_EXTRA_FLAGS + -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable + -Wno-error=unused-variable -Wno-unused-variable + -Wno-error=format-extra-args -Wno-format-extra-args + -Wno-error=comment -Wno-comment + -Wno-error=format -Wno-format + -Wno-error=maybe-uninitialized -Wno-maybe-uninitialized + -Wno-error=switch -Wno-switch + -Wno-error=return-type -Wno-return-type + -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor + -Wno-error=ignored-qualifiers + -Wno-ignored-qualifiers + -Wno-sign-compare + -Wno-reorder + -Wno-error=cpp) + +if(WITH_GPU) + set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}) +else() + set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO) endif() +ExternalProject_Add( + extern_anakin + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLML_PROJECT} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin" + GIT_TAG "3c8554f4978628183566ab7dd6c1e7e66493c7cd" + PREFIX ${ANAKIN_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${CMAKE_ARGS_PREFIX} + -DUSE_X86_PLACE=YES + -DBUILD_WITH_UNIT_TEST=NO + -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf + -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml + -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} +) + +message(STATUS "Anakin for inference is enabled") +message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") + +add_library(anakin_shared SHARED IMPORTED GLOBAL) +set_property(TARGET anakin_shared PROPERTY 
IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) +add_dependencies(anakin_shared extern_anakin protobuf mklml) + +add_library(anakin_saber SHARED IMPORTED GLOBAL) +set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) +add_dependencies(anakin_saber extern_anakin protobuf mklml) + +list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index 73713d93d5a52738651dda498fac5ea66e3589d2..ada61de8eb15ae10288ac54f588e9adf84acee37 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -28,7 +28,12 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) endif() -MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") +IF (WIN32) + MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) +else() + MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") +ENDIF(WIN32) + set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) @@ -36,12 +41,13 @@ set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) +if (NOT WIN32) ExternalProject_Add( ${BOOST_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz - && tar zxf ${BOOST_TAR}.tar.gz + && tar zxf ${BOOST_TAR}.tar.gz DOWNLOAD_NO_PROGRESS 1 PREFIX ${BOOST_SOURCES_DIR} CONFIGURE_COMMAND "" @@ -49,8 +55,9 @@ ExternalProject_Add( INSTALL_COMMAND "" UPDATE_COMMAND "" ) +endif(NOT WIN32) -if (${CMAKE_VERSION} VERSION_LESS "3.3.0") +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") add_library(boost STATIC ${dummyfile}) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c94849cf4b96746e6c507db2a6310c2f305dacf5 --- /dev/null +++ b/cmake/external/cub.cmake @@ -0,0 +1,35 @@ +if(NOT WITH_GPU) + return() +endif() + +include(ExternalProject) + +set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub) +set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub) + +include_directories(${CUB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_cub + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVlabs/cub.git" + GIT_TAG "v1.8.0" + PREFIX ${CUB_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(cub STATIC ${dummyfile}) +else() + add_library(cub INTERFACE) +endif() + +add_dependencies(cub extern_cub) + +LIST(APPEND externl_project_dependencies cub) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index a1d2d0f44685c342db9d868da716809b49575c01..cf58cc39762351f8b37d073bcd218d249285bf52 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -18,7 +18,7 @@ SET(GFLAGS_SOURCES_DIR 
${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ENDIF(WIN32) @@ -45,7 +45,13 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) - +IF(WIN32) + IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") + add_custom_command(TARGET extern_gflags POST_BUILD + COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib + ) + ENDIF() +ENDIF(WIN32) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) @@ -60,3 +66,4 @@ IF(WITH_C_API) INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) ENDIF() ENDIF() + diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index ac0181e69cbf5efeee44c5ca801b2710eefb3e6d..25ef2970ac52f12f961c9c6d3a589fec4c80983f 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -60,6 +60,13 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") + add_custom_command(TARGET extern_glog POST_BUILD + COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib + ) + ENDIF() +ENDIF(WIN32) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 85f40585da29bab9a107f5546e64870975f4c2d3..fd9835d023c67b76579913f2ec56c2444fea8c15 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -44,12 +44,13 @@ ExternalProject_Add( # 3. keep only zlib, cares, protobuf, boringssl under "third_party", # checkout and clean other dirs under third_party # 4. remove .git, and package the directory. - URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" + URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz" URL_MD5 "1f268a2aff6759839dccd256adcc91cf" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 + PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h # NOTE(yuyang18): # Disable -Werror, otherwise the compile will fail in MacOS. # It seems that we cannot configure that by make command. diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake new file mode 100644 index 0000000000000000000000000000000000000000..530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0 --- /dev/null +++ b/cmake/external/libxsmm.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) + +IF(NOT WITH_LIBXSMM) + return() +ENDIF() + +IF(WIN32 OR APPLE OR ANDROID OR IOS) + MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") + SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) + return() +ENDIF() + +INCLUDE (ExternalProject) + +SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) +SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) +SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) +SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) +SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" + "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +ExternalProject_Add( + extern_libxsmm + GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" + GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" + PREFIX ${LIBXSMM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install + INSTALL_COMMAND "" +) +ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") +include_directories(${LIBXSMM_INCLUDE_DIR}) +ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) +ADD_DEPENDENCIES(libxsmm extern_libxsmm) +LIST(APPEND external_project_dependencies libxsmm) + diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 20dda35c5ccd98f5672d867c26ab97a215483543..baf253df2755657b01b67c410f63b7d8422d4df3 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -24,7 +24,7 @@ SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) IF(WIN32 OR APPLE) - MESSAGE(WARNING + MESSAGE(WARNING "Windows or Mac is not supported with MKLDNN in Paddle yet." 
"Force WITH_MKLDNN=OFF") SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) @@ -54,11 +54,13 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51" + GIT_TAG "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index ce6a88b51dc98ac46dd3935f12658d60d364ba8c..c3fbe4dbdb28f1008bb274ee18293db348bfc6ed 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,20 +17,29 @@ IF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS) INCLUDE(cblas) +# IF(WIN32 AND NOT ${CBLAS_FOUND}) + + IF(NOT ${CBLAS_FOUND}) + INCLUDE(ExternalProject) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) - SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) + SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) + IF (WIN32) + SET(CBLAS_FOUND true) + MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) + ENDIF(WIN32) + IF (NOT WIN32) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_COMMIT "v0.2.20") @@ -69,7 +78,6 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} @@ -84,9 +92,11 @@ IF(NOT ${CBLAS_FOUND}) UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) + ELSE() + ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) + INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas) # Because libopenblas.a is a symbolic link of another library, thus need to # install the whole directory. 
     IF(ANDROID)
@@ -107,7 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
@@ -121,6 +132,11 @@ ELSE()
   TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+IF(WITH_LIBXSMM)
+  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
+  ADD_DEPENDENCIES(cblas extern_libxsmm)
+ENDIF()
+
 IF(NOT ${CBLAS_FOUND})
   ADD_DEPENDENCIES(cblas extern_openblas)
   LIST(APPEND external_project_dependencies cblas)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2665996432b1f6681927320a85d6835094abe4cd..550b0dada8e90c1e2b33705fd53c065672113b45 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,11 +14,14 @@ INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
+IF(NOT WIN32)
 FIND_PACKAGE(Protobuf QUIET)
+ENDIF(NOT WIN32)
 macro(UNSET_VAR VAR_NAME)
   UNSET(${VAR_NAME} CACHE)
   UNSET(${VAR_NAME})
 endmacro()
+
 UNSET_VAR(PROTOBUF_INCLUDE_DIR)
 UNSET_VAR(PROTOBUF_FOUND)
 UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
@@ -94,12 +97,14 @@ macro(PROMPT_PROTOBUF_LIB)
     SET(protobuf_DEPS ${ARGN})
     MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+    MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
     MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+    MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
     MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
     INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
     # Assuming that all the protobuf libraries are of the same type.
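+    # ${CMAKE_STATIC_LIBRARY_SUFFIX} is ".a" on Linux/macOS and ".lib" on Windows,
+    # so matching the found library path against it decides protobuf_LIBTYPE below.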
- IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX}) SET(protobuf_LIBTYPE STATIC) ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") SET(protobuf_LIBTYPE SHARED) @@ -137,18 +142,25 @@ macro(SET_PROTOBUF_VERSION) endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") +IF (WIN32) + SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) + MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT}) +ENDIF(WIN32) + if (NOT "${PROTOBUF_ROOT}" STREQUAL "") + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) - find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) - find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + SET(PROTOBUF_FOUND true) SET_PROTOBUF_VERSION() PROMPT_PROTOBUF_LIB() else() - message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.") + message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}") endif() endif() @@ -239,6 +251,7 @@ IF(CMAKE_CROSSCOMPILING) CACHE FILEPATH "protobuf executable." FORCE) ENDIF() + IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index d7e5571bdbd8ba58d8a08c9426971f1c7b186413..f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,9 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp 2.7) -FIND_PACKAGE(PythonLibs 2.7) +FIND_PACKAGE(PythonInterp ${PY_VERSION}) +FIND_PACKAGE(PythonLibs ${PY_VERSION}) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake new file mode 100644 index 0000000000000000000000000000000000000000..384c2f9328296ce6a8a6293be6cc47e5063dd3c4 --- /dev/null +++ b/cmake/external/xbyak.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
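+# Xbyak is a header-only x86/x86-64 JIT assembler used to generate vectorized
+# kernels at runtime.  The definitions added below follow the usual Xbyak
+# conventions:
+#   XBYAK64           - generate 64-bit code only
+#   XBYAK_NO_OP_NAMES - use and_()/or_()/xor_()/not_() mnemonics, since the
+#                       plain names are reserved alternative operator tokens in C++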
+ +set(WITH_XBYAK ON) +if(WIN32 OR APPLE) + SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) + return() +endif() + +include(ExternalProject) + +set(XBYAK_PROJECT extern_xbyak) +set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) +set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) +set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) + +include_directories(${XBYAK_INC_DIR}) +include_directories(${XBYAK_INC_DIR}/xbyak) + +add_definitions(-DPADDLE_WITH_XBYAK) + +# xbyak options +add_definitions(-DXBYAK64) +add_definitions(-DXBYAK_NO_OP_NAMES) + +ExternalProject_Add( + ${XBYAK_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS "" + GIT_REPOSITORY "https://github.com/herumi/xbyak.git" + GIT_TAG "v5.661" # Jul 26th + PREFIX ${XBYAK_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT} +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";") + add_library(xbyak STATIC ${dummyfile}) +else() + add_library(xbyak INTERFACE) +endif() + +add_dependencies(xbyak ${XBYAK_PROJECT}) +list(APPEND external_project_dependencies xbyak) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 1120677a37e0d44163816b66600121c8f0d545af..e0556a0babc74ba6efa0a190d4f7b77416bef3bf 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -102,7 +102,6 @@ set(COMMON_FLAGS -fno-omit-frame-pointer -Wall -Wextra - -Werror -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter @@ -115,6 +114,11 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE ) +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) +list(APPEND COMMON_FLAGS -Werror) +endif() + set(GPU_COMMON_FLAGS -fPIC -fno-omit-frame-pointer @@ -142,6 +146,11 @@ else() ${GPU_COMMON_FLAGS}) endif() +if(UNIX AND NOT APPLE) + # except apple from nix*Os family + set(LINUX TRUE) +endif(UNIX AND NOT APPLE) + foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index eafb11b6f21e226fc68556a78d675dea94080140..6d230942321f8d82a14f5c58037134deb0ab222d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -148,7 +148,8 @@ function(merge_static_libs TARGET_NAME) COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles} ) - else() # general UNIX: use "ar" to extract objects and re-add to a common lib + endif(APPLE) + if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir) foreach(lib ${libs}) @@ -187,7 +188,36 @@ function(merge_static_libs TARGET_NAME) COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'` COMMAND ${CMAKE_RANLIB} ${target_LIBNAME} WORKING_DIRECTORY ${target_DIR}) - endif() + endif(LINUX) + if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs. + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
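+    # The Windows merge therefore works in three steps: touch a dummy source
+    # whenever any input lib changes, build a placeholder static library from
+    # it, and then merge the real .lib files into lib${TARGET_NAME}.lib with
+    # MSVC's lib.exe in the POST_BUILD command below.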
+ add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + #if(NOT $ MATCHES "lib.*\\.lib") + # message("library" ${lib}) + # set(libfiles ${libfiles} lib$) + #else() + set(libfiles ${libfiles} $) + #endif() + endforeach() + + # windows cmd return error in clean env. + # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles} + ) + endif(WIN32) endfunction(merge_static_libs) function(cc_library TARGET_NAME) @@ -195,6 +225,10 @@ function(cc_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WIN32) + # add libxxx.lib prefix in windows + set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + endif(WIN32) if(cc_library_SRCS) if(cc_library_SHARED OR cc_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) @@ -263,8 +297,11 @@ function(cc_test TARGET_NAME) COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endif() endfunction(cc_test) @@ -328,8 +365,11 @@ function(nv_test TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endif() endfunction(nv_test) @@ -577,7 +617,9 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c6979713231f631f8757e4139d6f685d4554b54e..077072f6eadb0c48f4ae32f94828613d89ed01c9 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -101,6 +101,7 @@ if(WITH_MKLDNN) ) endif() +if (NOT WIN32) 
if(NOT MOBILE_INFERENCE AND NOT RPI) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") copy(snappy_lib @@ -120,14 +121,19 @@ if(NOT MOBILE_INFERENCE AND NOT RPI) DSTS ${dst_dir} ${dst_dir}/lib DEPS zlib) endif() +endif(NOT WIN32) # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") -copy(framework_lib DEPS framework_py_proto +if (NOT WIN32) +set(framework_lib_deps framework_py_proto) +endif(NOT WIN32) +copy(framework_lib DEPS ${framework_lib_deps} SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} + ${src_dir}/${module}/ir/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir ) set(module "memory") @@ -138,29 +144,22 @@ copy(memory_lib set(inference_deps paddle_fluid_shared paddle_fluid) -if(WITH_CONTRIB) - message(STATUS "installing contrib") - set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") - if (WITH_ANAKIN AND WITH_GPU) - copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api - SRCS - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api - ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release - DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin) - list(APPEND inference_deps contrib_anakin_inference_lib) - endif() - - copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared - SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api* - DSTS ${contrib_dst_dir} ${contrib_dst_dir}) - list(APPEND inference_deps contrib_inference_lib) +set(module "inference/api") +if (WITH_ANAKIN AND WITH_MKL) + copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api + SRCS + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api + ${ANAKIN_INSTALL_DIR} # anakin release + DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) + list(APPEND inference_deps anakin_inference_lib) endif() set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - DSTS ${dst_dir}/${module} ${dst_dir}/${module} + ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 53c2de332ea74b06d1bd6e5bb119cad6af27ed01..3eacf4d86aa0385eddb690d72e85e3384929bb99 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID set(SSE3_FLAG "-msse3") set(AVX_FLAG "-mavx") set(AVX2_FLAG "-mavx2") + set(AVX512F_FLAG "-mavx512f") elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") @@ -81,5 +82,16 @@ int main() return 0; }" AVX2_FOUND) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_undefined_epi32(); + return 0; +}" 
AVX512F_FOUND) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) +mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index ac19b1651893f18b14c62a0986df75bed25d7e80..8f65a737c43a124c05574d6eb9c3050fdab5299a 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a DOC "Path to TensorRT library.") if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) + if(WITH_DSO) set(TENSORRT_FOUND ON) + endif(WITH DSO) else() set(TENSORRT_FOUND OFF) endif() diff --git a/cmake/version.cmake b/cmake/version.cmake index cde650128a068faf32f4abfff5cdfdeb656d8577..ac10bdf067be549fe90112aef73fd6e1fbe0ac48 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -1,23 +1,46 @@ # Get the latest git tag. set(PADDLE_VERSION $ENV{PADDLE_VERSION}) set(tmp_version "HEAD") +set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") +set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+") while ("${PADDLE_VERSION}" STREQUAL "") + # Check current branch name execute_process( - COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_TAG_NAME - RESULT_VARIABLE GIT_RESULT + OUTPUT_VARIABLE GIT_BRANCH_NAME + RESULT_VARIABLE GIT_BRANCH_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT ${GIT_RESULT}) - # Check the tag is a correct version - if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") - string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME}) - else() # otherwise, get the previous git tag name. - set(tmp_version "${GIT_TAG_NAME}~1") + if (NOT ${GIT_BRANCH_RESULT}) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_TAG_NAME + RESULT_VARIABLE GIT_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ${GIT_RESULT}) + # Check if current branch is release branch + if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}") + # Check the tag is a correct version + if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") + # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest + set(PADDLE_VERSION "0.0.0") + elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME}) + else() # otherwise, get the previous git tag name. + set(tmp_version "${GIT_TAG_NAME}~1") + endif() + else() + # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest + set(PADDLE_VERSION "0.0.0") + endif() + else() + set(PADDLE_VERSION "0.0.0") + message(WARNING "Cannot add paddle version from git tag") endif() else() set(PADDLE_VERSION "0.0.0") - message(WARNING "Cannot add paddle version from git tag") + message(WARNING "Cannot add paddle version for wrong git branch result") endif() endwhile() diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index db2842e7f23e74130a966bb347004bee1ccb08fd..f23ecc1f80030f20359ce9675130a167722606c9 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -38,11 +38,3 @@ _switch_scope .. autofunction:: paddle.fluid.executor._switch_scope :noindex: -.. _api_fluid_executor_fetch_var: - -fetch_var ---------- - -.. 
autofunction:: paddle.fluid.executor.fetch_var - :noindex: - diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst index 51cdfe0c2ed045a5b3247c4fdec9868d756eae86..7eab58355c3648d929d3b5d98984adce9034f016 100644 --- a/doc/fluid/api/fluid.rst +++ b/doc/fluid/api/fluid.rst @@ -106,22 +106,6 @@ _switch_scope .. autofunction:: paddle.fluid._switch_scope :noindex: -.. _api_fluid_fetch_var: - -fetch_var ---------- - -.. autofunction:: paddle.fluid.fetch_var - :noindex: - -.. _api_fluid_Go: - -Go --- - -.. autoclass:: paddle.fluid.Go - :members: - :noindex: .. _api_fluid_make_channel: diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index d443c49657b92583e527035f49e74462cf41487d..6f0267cd7a1d0afcdcb1596a46ffe2d15eea100d 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -822,6 +822,14 @@ pad .. autofunction:: paddle.fluid.layers.pad :noindex: +.. _api_fluid_layers_pad_constant_like: + +pad_constant_like +--- + +.. autofunction:: paddle.fluid.layers.pad_constant_like + :noindex: + .. _api_fluid_layers_label_smooth: label_smooth @@ -1145,6 +1153,14 @@ sigmoid .. autofunction:: paddle.fluid.layers.sigmoid :noindex: +.. _api_fluid_layers_hsigmoid: + +hsigmoid +------- + +.. autofunction:: paddle.fluid.layers.hsigmoid + :noindex: + .. _api_fluid_layers_logsigmoid: logsigmoid @@ -1768,3 +1784,11 @@ reverse .. autofunction:: paddle.fluid.layers.reverse :noindex: +.. _api_fluid_layers_rank_loss: + +rank_loss +------- + +.. autofunction:: paddle.fluid.layers.rank_loss + :noindex: + diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md index aa7455ec5de0d46d7c2b0cef3b7ebf4754af3cb1..b8b8427811cddcddf872db5badfd37c96a76c3e3 100644 --- a/doc/fluid/design/dist_train/dist_train_nccl2.md +++ b/doc/fluid/design/dist_train/dist_train_nccl2.md @@ -1,7 +1,7 @@ # Distributed Training with NCCL2 We design a pattern that can enable training with `ParallelExecutor` and -using [NCCL2](https://developer.nvidia.com/nccl) as it's collective +use [NCCL2](https://developer.nvidia.com/nccl) as it's collective communication library. In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast` @@ -9,14 +9,14 @@ to do multi GPU training. And if we initialize NCCL2 communicators as ranks in a distributed environment, we can simply run the `ParallelExecutor` as a distributed program! The only thing that may be different than in the single node version is that we need to broadcast the NCCL unique ID -to all the nodes, and initialize communicators using that ID, so NCCL2 -will know each other as ranks. +to all the nodes and initialize communicators using that ID, so NCCL2 +can know each other as ranks. To achieve this feature, we introduce a new operator: `gen_nccl_id` op, so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in -what ever platform you like. +whatever platform you like. -It have two running modes: +It has two running modes: 1. Generate and broadcast mode, which should be used on trainer 0; 1. Listen and fetch mode, which should be used on trainers other than 0. @@ -29,7 +29,7 @@ initialize NCCL communicator objects. The above figure indicates the general process when training with NCCL2 -distributed. Each trainer have the number of communicators equal to the +distributed. 
Each trainer has the number of communicators equal to the number of GPUs, but the ranks should match the global ranks number: here we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1. diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md index 97f890c88e778a59ea475e984ccbc28cf026fc5b..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48 100644 --- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md +++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md @@ -1,6 +1,6 @@ # Design Doc: Distributed Lookup Table Operator -A lookup table operator in PaddlePaddle where the table could be out +A distribute lookup table operator in PaddlePaddle where the table could be out of the memory of a computer. ## Background @@ -24,14 +24,14 @@ memory, so we'd need a distributed storage service, which supports the lookup of rows. The following figure illustrates the multiplication of x with two -non-zero elements, or say, two symbols, and a lookup table W: +non-zero elements, or say two symbols, and a lookup table W: ![lookup table](./src/lookup_table.png) ### The Backward Algorithm The backward algorithm computes W'(x) using W(x). W'(x) has the same -scale of size as W(x) and is much smaller than W. +the scale of size as W(x) and is much smaller than W. To optimize W given W', we can do simple SGD update: @@ -44,111 +44,46 @@ $$W = f(W, W')$$ The following figure illustrates the backward pass of the lookup operator: ![lookup table training](./src/lookup_table_training.png) -## Distributed Storage Service - -The forward algorithm requires a distributed storage service for W. -The backward algorithm prefers that the storage system can apply the -optimization algorithm on W. The following two sections describe two -solutions -- the former doesn't require that the storage service can -do optimization, the latter does. - -### Storage Service Doesn't Optimize - -In this design, we use highly-optimized distributed storage, e.g., -memcached, as the storage service, and we run the optimization -algorithm on parameter servers of PaddlePaddle. The following figure -illustrates the training process. - - - - - -Each trainer runs the forward and backward passes using their local -data: - -1. In the forward pass, when a trainer runs the forward algorithm of a - lookup operator, it retrieves W(x) from the storage service. -1. The trainer computes W'(x) in the backward pass using W(x). - -During the global update process: - -1. Each trainer uploads its W'(x) to parameter servers. -1. The parameter server runs the optimization algorithm, e.g., the - Adam optimization algorithm, which requires that - 1. The parameter server retrieves W(x) from memcached, and - 1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j - W'(x))$ to memcached, where $f$ denotes the optimization - algorithm. - -### Storage Service Does Optimize - -This design is very similar to the above one, except that the -optimization algorithm $f$ runs on the storage service. - -- Pro: parameter servers do not retrieve W(x) from the storage - service, thus saves half network communication. -- Con: the storage service needs to be able to run the optimization - algorithm. - -## Distributed Sparse Table in Fluid - -For another design, we can implement a distributed sparse table in Fluid, -and don't need to maintain an external storage component while training. 
- -You may need to read Fluid [Distributed Training Architecture](./distributed_architecture.md) -and [Parameter Server](./parameter_server.md) before going on. - -![fluid lookup remote table](./src/fluid_lookup_remote_table.png) - -Partition a large table into multiple pserver instances -1. `DistributeTranspiler` would split the table partitioned into some small -table blocks with some partitioned algorithms such as -[RoundRobin](https://en.wikipedia.org/wiki/Round-robin_scheduling), -[Hash](https://en.wikipedia.org/wiki/Hash) and etc... -1. For some cases, the range of input `Ids` is very wide and unpredictable, so the sparse -table would be able to fill a new value for the id that didn't appear before with -zero, uniform random or Gaussian distribution. - -For each Trainer's training process: -1. In the forward pass, we use `pre-fetch` op to pre-fetch parameter blocks according to the -input `Ids` from PServers instead of the local `lookup_table` op, and then merge the blocks -into a parameter `W`. -1. Compute `GRAD@W'` in the backward pass using the pre-fetched `W` and send it to PServer to -execute the optimize pass. - -## Conclusion - -Let us do the "storage service does not optimize" solution first, as a -baseline at least, because it is easier to use a well-optimized -distributed storage service like memcached. We can do the "storage -service does optimize" solution later or at the same time, which, if -implemented carefully, should have better performance than the former. +## Distributed Lookup Table +### Problem 1: The lookup table may be very large. + + In the condition like the search engine and recommendation system, the number of feature Id may be very large, say 100,000,000,000, then for a float value lookup table of size 8, the total size of the table is: + + ``` + 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB + ``` + +### Solution: Distributed storage + +1. Paddle use [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table, the lookup table parameter will be split to multi-machine according to the hash of the feature ID, and data will also be split and send to the same machine to prefetch the parameter. + +1. For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data feature is very sparse, every time we only need a few parameters for training, so we use `prefetch_op` to only prefetch the parameter needed to trainer. + +### Problem 2. The Id in the lookup table is not sure before training. + + The feature Id is calculated by the hash function because the feature data source is so large, we can not get all the Id before training. So we can not initialize the table before training. + +### Solution: Id auto growth + +At the beginning of training, paddle only malloc the memory for the lookup table at parameter server side, the Id and it's value will not be initialized. During training, when a parameter server received an Id, if it is already in the lookup table, it will return the existing parameter, if the Id does not exist, paddle will add it into the lookup table and initialize the value for it. + +### Problem 3: parameter load and save + +For common parameters, paddle use trainer to save and load them. But for distributed lookup table, trainer cannot do this because it's large size. 
+ +### Solution: Parameter server side save and load + +Paddle support parameter server side save and load for distribute lookup table. Each machine of parameter servers will only save and load part of the whole table. + +## Architecture +The whole architecture of the distribute lookup table is as below: + +### Training steps: +1. Read a batch of data, the data is feature ids. +1. The input ids will be split by `split_ids_op` with the same hash function of the lookup table. +1. The `prefetch_op` use the split result to prefetch parameters back from the lookup table. +1. Run forward-backward to get the gradient of the lookup table. +1. `split_ids_op` split the gradient and then use `send_op` to the parameter server. +1. parameter server update the table with the received gradient. + +![distribute lookup table](./src/distributed_lookup_table.jpeg) diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle new file mode 100644 index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07 Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..83ef97c99efeaf27a27f93f0cd3857c0f1bc812e --- /dev/null +++ b/doc/fluid/design/ir/overview.md @@ -0,0 +1,185 @@ +## Motivation + +There is a `gap` between the `Program` defined by +user and the `Executable` that can be scheduled +efficiently on heterogeneous hardware, either locally +or distributedly. + +Usually, the `gap` is bridged by + +* A serious transformations with defined order. + +* These transformations usually involve +`insert, delete, clustering, split, dependency analysis`. + +* Has a simple way to verify and debug each transformation. + +* Flexible to add, remove or customize transformations to fit +the requirements of various algorithms (models) and hardware secenarios. + +Some other events also push us to a better unified pattern. + +* The deep learning framework is built around the concepts of graphs. +To leverage tools such as compilation (e.g. TVM and nGraph) or +cross-framework conversion (e.g. ONNX), we also need a intermediate +representation that can be connected to the rest of the ecosystem. + + +We need a unified pattern to naturally support the requirements +described above. The pattern should fit both training, inference +and other offline serielized model transformations. +Learned from LLVM and other deep learning framework, we draft the +design below. + + +## Design + +### Major Concepts + +#### Node + +`Node` represents an operation that performs some computation or +a variable that is input or output of operation. + +`Node`s are connected to other `Node`s via inputs and outputs. + +Other properties (maybe device placement information) can be added +to `Node` in the future if it's a +common requirement of many other `Pass`es. Otherwise, it should live +in a `Node` wrapper class that is private to some `Pass` or be +a local member of a `Pass`. 
+ +#### Graph + +`Graph` contains a list of `Node`s, which are connected to +each other via inputs and outputs. + +TODO: Better definitions for the graph. + +`Graph` can also contain `Attribute`s. `Attribute`s +can be `any` thing. For example, it can be a list of "wraper" +nodes. The `wrapper` nodes compose `Node`s and provide +helper method for execution or transformation. `Attribute` +can also contain other things that describe some properties of +the `Graph` or `Graph` nodes. `Attribute` can be passed +across `Pass`. However, it should be used with care. + +```cpp +class Graph { + public: + explicit Graph(const ProgramDesc &program); + + bool Has(const std::string &attr_name) const; + + template + AttrType &Get(const std::string &attr_name) const; + + template + void Set(const std::string &attr_name, AttrType *attr); + const std::unordered_set &Nodes() const; + + // Create a normal variable with non-null VarDesc. + ir::Node *CreateVarNode(VarDesc *var_desc); + + // Create a normal runnable operator with OpDesc. + ir::Node *CreateOpNode(OpDesc *op_desc); + + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. + ir::Node *CreateControlDepVar(); + + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. + ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type); + + // Clear all node information of the graph and return the ownership of the + // nodes. + std::vector> ReleaseNodes(); +}; +``` + +#### Pass + +`Pass` represents a transformation of `Graph`. Its input +is a `Graph` and its output is also a `Graph`. For example, +a `Pass` can simply print out the `Graph`. A `Pass` +can also fuse some `Graph`'s `Node`s. + +```cpp +class Pass { + public: + + std::unique_ptr Apply(std::unique_ptr graph) const { + // Some correctness check. + auto new_graph = ApplyImpl(std::move(graph)); + // Some correctness check. + return new_graph; + } + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const; + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) ; + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string &attr_name, AttrType *attr); + + protected: + virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const = 0; +}; + +// In my_pass.cc +class MyPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override { + // do something. + return graph; + } +} +REGISTER_PASS(my_pass, MyPass) +.RequirePassAttr("places") +.RequireGraphAttr("dep_vars"); + + +// To use the pass. +auto my_pass = ir::PassRegistry::Instance().Get("my_pass"); +graph = my_pass->Apply(std::move(graph)); +// Note: to force link my_pass.cc, in the code: +USE_PASS(my_pass); +``` + +#### Optimize + +`Optimize` contains a series of `Pass` with defined order. +`Optimize` transforms a `Graph` that only contains raw +modeling logic to a `Graph` that can be run efficiently while +maintaining the original modeling logic. + + +### Optimize Process + +* Program is first converted to Graph. +* Graph goes through a series of Pass +* Graph is transformed from raw model logic to a +form that is efficient to execute. 
+ +``` +// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +auto graph = Graph(program); +graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah)); +// For more complex Pass, Optimize Process can provide Pass attributes. +auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass"); +mem_opt_pass.SetNotOwned("optimize_level", 1); +mem_opt_pass->Apply(std::move(graph)); +graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(grah)); +graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(grah)); +Executor exe; +exe.Run(graph); + +``` diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md index 265732a348ea77d21005e335390d99abcdfbd045..83af4e55485c079265d3f2b1e15070825b532c02 100644 --- a/doc/fluid/design/modules/python_api.md +++ b/doc/fluid/design/modules/python_api.md @@ -98,13 +98,13 @@ class Block(objects): def append_operator(self, ...): self.ops.append(Operator(self, ...)) - def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. self.ops.prepend(Operator(self, ...)) ``` `create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. -`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. +`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. ### Operator diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md index 6c6db08f463ae0a2b94fc4546f123a1d7c151870..97f395133b48a1d0ed5136f0ebc8720b8ca87ded 100644 --- a/doc/fluid/design/others/graph_survey.md +++ b/doc/fluid/design/others/graph_survey.md @@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs): -Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null. +Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null. Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af --- /dev/null +++ b/doc/fluid/design/quantization/fixed_point_quantization.md @@ -0,0 +1,110 @@ +Fixed-point quantization uses lower bits, for example, 2-bit, 3-bit or 8-bit fixed point to represent weights and activations, which usually are in singe-precision float-point with 32 bits. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements. 
It is especially important for the inference in embedded-device deployment. + +According to some experiments, the apporach to quantize the model trained in float point directly works effectively on the large models, like the VGG model having many parameters. But the accuracy drops a lot for the small model. In order to improve the tradeoff between accuracy and latency, many quantized training apporaches are proposed. + +This document is to design a quantized training framework on Fluid. The first part will introduce how to quantize, The second part will describe the quantized training framework. The last part will illustrate how to calculate the quantization scale. + + +### How to quantize + +There are many ways to quantize the float value to fixed-point value. For example: + +$$ r = min(max(x, a), b)$$ +$$ s = \frac{b - a}{n - 1} $$ +$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$ + +where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer. + + +The quantization we applied is parameterized by the number of quantization levels and maximum absolute value: + +$$ M = max(abs(x)) $$ +$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$ + +where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer. + + +Wether the *min-max* quantization or *max-abs* quantization, they also can be represent: + +$q = scale * r + b$ + +We call *min-max*, *max-abs* as the quantization arguments, also call them quantization scale or quantization range. + + +How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part. + + +### Training Framework + +#### Forward pass + +The forward pass is simulated quantization, see Figure 1. + +The training framework is as following figure. + +

+Figure 1. Forward in training with simulated quantization.
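To make the max-abs formulas above concrete, here is a minimal NumPy sketch of 8-bit max-abs quantization and the matching dequantization. It only illustrates the equations; it is not the Fluid quantization transpiler or any Paddle API.

```python
import numpy as np

def quantize_max_abs(x, num_bits=8):
    # q = round(x / M * (n - 1)), with M = max(abs(x)) and n = 2^num_bits
    n = 2 ** num_bits
    m = np.max(np.abs(x))
    return np.round(x / m * (n - 1)).astype(np.int32), m

def dequantize_max_abs(q, m, num_bits=8):
    # inverse mapping: x ~= q / (n - 1) * M
    n = 2 ** num_bits
    return q.astype(np.float32) / (n - 1) * m

x = np.random.uniform(-1, 1, (3, 4)).astype(np.float32)
q, m = quantize_max_abs(x)
print(np.abs(x - dequantize_max_abs(q, m)).max())  # small quantization error
```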

+ +- Firstly, both input and weight will be quantized to 8-bit integers. +- Second, do the multiplication (or convolution) operation with integers. +- Third, dequantize the multiplication (or convolution) results to 32-bit float point. +- Finally, do bias-addition in float type of 32 bit. Here, the bias is not quantized. + +For general matrix multiplication (GEMM), quantize for $X$ and $W$: + +$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil $$ +$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$ + +Do GEMM: + +$$ Y = X_q * W_q $$ + + +Dequantize $Y$: + +$$ +\begin{align} +Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\ + &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\ + &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m) +\end{align} +$$ + +From these formulas, dequantization also can be moved before GEMM, do dequantization for $Xq$ and $Wq$ at first, then do GEMM. The forward workflow in training is equivalent to following framework. + +

+Figure 2. Equivalent forward in training with simulated quantization.
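The equivalence pictured in Figure 2 can be checked numerically. The snippet below is a plain NumPy sketch (not Paddle code) that dequantizes once after the integer GEMM and once before it; both orderings produce the same result up to floating-point error.

```python
import numpy as np

n = 256  # 8-bit quantization: n = 2^8
X = np.random.uniform(-1, 1, (2, 3)).astype(np.float32)
W = np.random.uniform(-1, 1, (3, 4)).astype(np.float32)

X_m, W_m = np.abs(X).max(), np.abs(W).max()
X_q = np.round(X / X_m * (n - 1))  # quantized activations
W_q = np.round(W / W_m * (n - 1))  # quantized weights

# Dequantize after the GEMM, as in Figure 1.
Y_dq1 = (X_q @ W_q) / ((n - 1) * (n - 1)) * X_m * W_m
# Dequantize X_q and W_q first, then run the GEMM in float, as in Figure 2.
Y_dq2 = (X_q / (n - 1) * X_m) @ (W_q / (n - 1) * W_m)

print(np.allclose(Y_dq1, Y_dq2))  # True
```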

+ +We use this equivalent workflow in the training. In our desigin, there is a quantization transpiler to insert the quantization operator and the de-quantization operator in the Fluid `ProgramDesc`. Since the outputs of quantization and de-quantization operator are still in floating point, they are called faked quantization and de-quantization operator. And the training framework is called simulated quantization. + +#### Backward pass + +See Figure 3. The gradients are calculated by dequantized weights and activations. All inputs and outputs are float point with 32-bit. And in the weight updating process, the gradients will be added to the original weight, not the quantized or dequantized weights. + +

+Figure 3. Backward and weight updating in training with simulated quantization.

+ +So the quantization transipler will change some inputs of the corresponding backward operators. + +### How to calculate quantization scale + +There are two strategies to calculate quantization scale, we call them dynamic and static strategy. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the quantization scale for different inputs. + +For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the traning is finished. + +For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them: + + +1. Calculate the mean of maximum absolute during a window. +2. Calculate the max of maximum absolute during a window. +3. Calculate the running mean of maximum absolute during a window, as follows: + + $$ Vt = (1 - k) * V + k * V_{t-1} $$ + + where, $V$ is the maximum absolute value of current batch, $Vt$ is the running mean value. $k$ is a factor, such as 0.9. diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png new file mode 100644 index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2 Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341 Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md index 587d819f79fcf82549826359fbf04ad3af404446..ff7408111fa20a7a6a3a2fe9f9ba20835918f399 100644 --- a/doc/fluid/dev/new_op_cn.md +++ b/doc/fluid/dev/new_op_cn.md @@ -36,19 +36,19 @@ OpProtoMake定义 -`.cc`文件,Backward Op不需要定义OpProtoMake +.cc 文件,Backward Op不需要定义OpProtoMake Op定义 - `.cc`文件 + .cc 文件 Kernel实现 - CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 + CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。 注册Op - Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 + Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中 @@ -119,10 +119,29 @@ $$Out = scale*X$$ 这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 +### 定义GradProtoMaker类 +每个Op的必须有一个对应的GraProtoMaker,若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量的会造成显存浪费。 +下面示例定义了ScaleOp的GradProtoMaker。 + +```cpp +class ScaleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", GetAttr("scale")); + return std::unique_ptr(grad_op); + } +}; +``` ### 
定义Operator类 -下面的点实现了MulOp的定义: +下面实现了MulOp的定义: ```cpp class MulOp : public framework::OperatorWithKernel { @@ -334,3 +353,83 @@ ctest -R test_mul_op - 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。 - 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 + +### PADDLE_ENFORCE使用注意 + +实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义,基本格式如下: + +``` +PADDLE_ENFORCE(表达式, 错误提示信息) +PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息) +``` + +如果表达式为真,或者比较对象A=B,则检查通过,否则会终止程序运行,向用户反馈相应的错误提示信息。 +为了确保提示友好易懂,开发者需要注意其使用方法。 + +#### 总体原则 + +任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_**检查的地方,必须有详略得当的备注解释!**错误提示信息**不能为空! + +#### 提示信息书写标准 + +1. [required] 哪里错了?为什么错了? + - 例如:`ValueError: Mismatched label shape` +2. [optional] 期望的输入是什么样的?实际的输入是怎样的? + - 例如:`Expected labels dimension=1. Received 4.` +3. [optional] 能否给出修改意见? + - 例如:`Suggested Fix:If your classifier expects one-hot encoding label,check your n_classes argument to the estimatorand/or the shape of your label.Otherwise, check the shape of your label.` + +如果并非必要或者简洁的描述即可表达清楚以上要点,根据情况书写亦可。 + +##### FAQ 典型问题 + +1. 无报错信息或报错信息过于简单,不能给用户提供有效的提示! + +问题示例1 :未写提示信息 +``` +PADDLE_ENFORCE(ctx->HasInput("X"), ""); +``` +问题示例2 :提示信息过于简单 +``` +PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么? +``` + +2. 在报错信息中使用开发人员定义的变量缩写,不易理解! + +问题示例: +``` +PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_fwd_pd in device context"); //eltwise_fwd_pd用户可能看不懂 +``` + +3. OP内部调用非法接口:Op内部如果出现Output = ShareDataWith(Input) +问题示例: +```cpp +auto *out = ctx.Output("Out"); +auto *in = ctx.Input("X"); +out->ShareDataWith(*in); +``` +Op内部如果出现Output = ShareDataWith(Input),相当于operator图的中有一条隐藏边,连接了Input和Output,这条边无法在图分析中表达,引发基于图优化的错误。 + +4. OP实现的性能实践 +调用了eigen的broadcast, chop等操作,性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen,gpu实现可以实现cuda kernel. + + +#### OP InferShape检查提示信息特别说明 + +- 检查输入输出变量,请统一遵循以下格式 +`Input(变量名) of OP名 operator should not be null.` + +正确示例: +``` +PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); +``` + +- 反向Op的输入输出检查,要写明反向Op的名字 + +正确示例: +``` +PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LoDResetGrad opreator should not be null."); +``` diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md index 4c6728fba7150b0f1e180e57590f18a5b677c70d..acea9a2b5df903a958edf3683900e165670e196f 100644 --- a/doc/fluid/dev/releasing_process_cn.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -1,24 +1,23 @@ # PaddlePaddle发行规范 -PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 +PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -1. 对这个版本的提交,做如下几个操作: - * 使用Regression Test List作为检查列表,测试本次release的正确性。 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 - * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 - * 将这个版本的python wheel包发布到pypi。 - * 更新Docker镜像(参考后面的操作细节)。 -1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。 -1. 协同完成Release Note的书写。 +2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 +3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 +4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。 +5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 +6. 
release分支稳定后,打上正式的release tag,比如`0.10.0`。 +7. 将这个版本的python wheel包发布到pypi。 +8. 更新Docker镜像(参考后面的操作细节)。 需要注意的是: -* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 -* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 +* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。 + +* release分支原则上只接受修复类的修改,不接受新feature。 ## 发布wheel包到pypi @@ -61,24 +60,21 @@ docker push [镜像]:[version] ## PaddlePaddle 分支规范 -PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 - -* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中: - * `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。 - * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。 - * `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。 +PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 -* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。 - * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 - * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 - * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 - * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 +* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 +* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 +* `master`分支因为历史原因,已经废弃。 -* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 +* 其他开发者fork的feature branch。 + * 建议,开发者的feature branch需要同步主版本库的`develop`分支。 + * 建议,开发者的feature branch需要基于主版本库中的`develop`分支。 + * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 ## PaddlePaddle回归测试列表 -本列表说明PaddlePaddle发版之前需要测试的功能点。 +TODO ### PaddlePaddle Book中所有章节 diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index f989b964d6d1a329bbe31adc7ec10db017acaefa..b810dc941d27fdb5004812ab58e105502e83280f 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -4,26 +4,21 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti Each time we release a new PaddlePaddle version, we should follow the below steps: -1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`. -1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The - first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on. -1. After that, we should do: - * Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm - that this release has no major bugs. - * If regression test fails, we must fix those bugs and create a new `release/[version]` - branch from previous release branch. - * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`. - * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail). - * Update the Docker images (see below instructions for detail). -1. After above step, merge `release/[version]` branch to master and push a tag on the master commit, - then merge `master` to `develop`. -1. Update the Release Note. 
- -***NOTE:*** - -* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain - features only for current release, so that we can test on that version. -* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch. +1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` +2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。 +3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. +4. If QA or Developer find bugs. They should first fix and verify on `develop` branch. Then cherry-pick the fix to the release branch. Wait until the release branch is stable. +5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. +6. After release branch is stable,Create the official release tag,such as `0.10.0`. +7. Release the python wheel package to pypi. +8. Update the docker image (More details below). + +NOTE: + +* bug fix should happen on `develop` branch, then cherry-pick to relese branch. Avoid developing directly on release branch. + +* release normally only accept bug fixes. Don't add new features. + ## Publish Wheel Packages to pypi @@ -50,6 +45,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You * pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the old version. you must change the version number before upload a new one. +### Publish wheel Packages for MacOS + +You need to build the binary wheel package for MacOS before publishing, to +make sure that the package can be used by many versions of MacOS +(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.), +you must build the package ***exactly*** following below steps: + +Build steps: + +1. install python from python.org downloads, and make sure it's currently in use + in your system. +1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions. +1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build` +1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org +1. `make -j` +1. `pip install delocate` +1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl` + +Then the whl under `fixed_wheel` is ready to upload. + +Install steps: + +1. run `pip install paddlepaddle...whl` +1. find the `libpython.dylib` that are currently in use: + - for python.org package installs, do nothing. + - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path` + ## Publish Docker Images Our CI tool will push latest images to DockerHub, so we only need to push a version tag like: @@ -70,26 +92,22 @@ You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlep ## Branching Model -We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model, -with some modifications: - -* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed. -* `develop` branch is for development. 
Each commit on develop branch has passed CI unit test, but no - regression tests are run. -* `release/[version]` branch is used to publish each release. Latest release version branches have - bugfix only for that version, but no feature updates. -* Developer forks are not required to follow - [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) - branching model, all forks is like a feature branch. - * Advise: developer fork's develop branch is used to sync up with main repo's develop branch. - * Advise: developer use it's fork's develop branch to for new branch to start developing. - * Use that branch on developer's fork to create pull requests and start reviews. - * developer can push new commits to that branch when the pull request is open. -* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to - `master`, `develop` and `releases`. +PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model. + +* `develop` branch is used for development. Each comment to `develop` branc goes through unit tests and model regression tests. +* `release/[version]` branch is used for each release. Release branch is used for tests, bug fix and evetual release. +* `master` branch as been deprecated for historical reasons + +* Developer's feature branch。 + * Developer's feature branch should sync with upstream `develop` branch. + * Developer's feature branch should be forked from upstream `develop` branch. + * After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review. + * In the review process, develop modify codes and push to their own feature branch. ## PaddlePaddle Regression Test List +TODO + ### All Chapters of PaddlePaddle Book We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md index 75922e7d85a13e53ce94619a48d8da8b960e6c9a..56203d6fad444f61ef1be187ad0d149b2aa99ba4 100644 --- a/doc/fluid/dev/use_eigen_cn.md +++ b/doc/fluid/dev/use_eigen_cn.md @@ -7,7 +7,7 @@ Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 -关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) +关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) ## paddle::framework::Tensor diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md index cecd5c3a7a7339e3be6772543a534728ec132105..8adaf324fccb4cda7af16b9bace559c0642ae444 100644 --- a/doc/fluid/howto/cluster/nccl2_rdma_training.md +++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md @@ -1,12 +1,12 @@ # Distributed Training with NCCL2 and RDMA -When doing distributed multi-GPU training, network bandwith often becomes the -bottle neck. We introduce a way to use NCCL2 to do such training job to -achieve best performace. +When doing distributed multi-GPU training, network bandwidth often becomes the +bottleneck. We introduce a way to use NCCL2 to do such training job to +achieve best performance. 
-## Prepare Hardwares with RDMA and Multiple GPUs +## Prepare Hardware with RDMA and Multiple GPUs -I'm using two Linux servers each of them is installed with 8 GPUs and +I'm using two Linux servers each of them installed with 8 GPUs and one 100Gb RDMA card. Base environment is: @@ -25,7 +25,7 @@ In general, the steps including: 1. Use docker to run tests and make sure GPUs and RDMA can work inside the container. -I'll ommit section "Install GPU drivers" because we can find it easily +I'll omit the section "Install GPU drivers" because we can find it easily somewhere else. ### Install RDMA drivers @@ -33,7 +33,7 @@ somewhere else. For my case, I've got two machines with device "Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was "CentOS 7.4" and I updated the kernel to version 4.4 so that docker can -work with latest overlay2 filesystem. +work with the latest overlay2 filesystem. ***NOTE: before you start, make sure you have a way to get a console of the server other than ssh because we may need to re-configure the @@ -45,14 +45,14 @@ network device.*** 1. Run `./mlnxofedinstall --add-kernel-support` in the software package. 1. Run `/etc/init.d/openibd restart` to make everything work, note that this operation may cause the network goes down if you are using this - RDMA device as default network device and use ssh to login the server. + RDMA device as default network device and use ssh to log in the server. 1. Re-configure the network interface, for example: `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed: `ip route add default via 192.168.16.1 dev eth2`. 1. Do the same thing on the other node. 1. Use `ping` to test if the two nodes have typical ICMP connection. 1. Use either `udaddy` or `ib_write_bw` to test the network connection is - ready and have the desired bandwith. + ready and have the desired bandwidth. ### Prepare Docker Image to Run RDMA Programs @@ -60,7 +60,7 @@ network device.*** package in it. 1. Start a docker container and mount GPU driver libs into it (you can skip this step if you are using nvidia-docker). -1. Mount RDMA dirvers and libs into the docker image (see below section), +1. Mount RDMA drivers and libs into the docker image (see below section), also `udaddy` and `ib_write_bw` if needed. 1. Mount GPU devices and RDMA devices into the container using `--device` or just use privileged mode `--privileged`. 
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst index 84005b54e07cf810649370d2c1f6b6c522434bf6..91357dd8c8da19f2f33c6f285ed7eb234428b1ab 100644 --- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst +++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst @@ -7,13 +7,13 @@ ====================== ======================================== 版本说明 C++预测库 ====================== ======================================== -cpu_avx_mkl `fluid.tgz `_ -cpu_avx_openblas `fluid.tgz `_ -cpu_noavx_openblas `fluid.tgz `_ -cuda7.5_cudnn5_avx_mkl `fluid.tgz `_ -cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ -cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ -cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ +cpu_avx_mkl `fluid.tgz `_ +cpu_avx_openblas `fluid.tgz `_ +cpu_noavx_openblas `fluid.tgz `_ +cuda7.5_cudnn5_avx_mkl `fluid.tgz `_ +cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ +cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ ====================== ======================================== 从源码编译 diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644 --- a/doc/fluid/howto/optimization/timeline_cn.md +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -1,21 +1,27 @@ # 如何使用timeline工具做性能分析 -1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 +1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 **提示:** 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。 ```python - with profiler.profiler('All', 'total', '/tmp/profile') as prof: - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[]) + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + if pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile") + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) ... ``` 1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。 +```python +python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline +``` 1. 打开chrome浏览器,访问,用`load`按钮来加载生成的`timeline`文件。 diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644 --- a/doc/fluid/howto/optimization/timeline_en.md +++ b/doc/fluid/howto/optimization/timeline_en.md @@ -1,15 +1,17 @@ # how to use timeline tool to do profile -1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. +1. Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. 
**Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. ```python - with profiler.profiler('All', 'total', '/tmp/profile') as prof: - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[], - use_program_cache=True) + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + if pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile") + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) ... ``` @@ -17,6 +19,10 @@ file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details. +```python +python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline +``` + 1. Open chrome and visit , use `load` button to load the generated `timeline` file. ![chrome tracing](./tracing.jpeg) diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md index 58aa73b8cd38d01e2426278a3479714e4fb6a3b0..749cf7693c75696feb17f8556224ed03649baa80 100644 --- a/doc/fluid/howto/performance/error_clip.md +++ b/doc/fluid/howto/performance/error_clip.md @@ -78,7 +78,7 @@ def error_clip_callback(block, context): op_desc = block.desc.op(block.desc.op_size() - 1) for grad_n in filter(lambda n: grad_to_var.has_key(n), op_desc.output_arg_names()): - fwd_var = block.var_recursive(grad_to_var[grad_n]) + fwd_var = block.__var_recursive(grad_to_var[grad_n]) error_clip = getattr(fwd_var, "error_clip", None) if not (error_clip is None or isinstance(error_clip, BaseErrorClipAttr)): diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst index d878d192cae7ee9e8b8fdb4f615839c186fdf334..6b1ef3ceed4f7ed5073d42c13ce103e2ab467e58 100644 --- a/doc/fluid/index_cn.rst +++ b/doc/fluid/index_cn.rst @@ -1,12 +1,16 @@ - PaddlePaddle Fluid -========================== +.. PaddlePaddle Fluid documentation master file, created by + sphinx-quickstart on Thu Jun 7 17:04:53 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +############## +欢迎使用 Fluid +############## .. 
toctree:: - :maxdepth: 1 + :maxdepth: 1 - getstarted/index_cn.rst - build_and_install/index_cn.rst - design/index_cn.rst - howto/index_cn.rst - dev/index_cn.rst - faq/index_cn.rst + new_docs/beginners_guide/index.rst + new_docs/user_guides/index.rst + new_docs/advanced_usage/index.rst + new_docs/faq/index_cn.rst diff --git a/doc/fluid/new_docs/advanced_usage/benchmark.rst b/doc/fluid/new_docs/advanced_usage/benchmark.rst new file mode 100644 index 0000000000000000000000000000000000000000..7854263bf8f64c840492550fb22152582c7d2361 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/benchmark.rst @@ -0,0 +1,120 @@ +################# +如何进行基准测试 +################# + +本文介绍如何给深度学习框架做基准测试。基准测试主要包含验证模型的精度和性能两方面,下文包含搭建测试环境,选择基准测试模型,验证测试结果等几方面内容。 + +验证深度学习框架,可分为训练和测试两个阶段, 验证指标略有不同,本文只介绍训练阶段的指标验证。训练阶段关注的是模型训练集上的精度,训练集是完备的,因此关注大batch\_size下的训练速度,关注吞吐量,例如图像模型常用的batch\_size=128, 多卡情况下会加大;预测阶段关注的是在测试集上的精度,线上服务测试数据不能提前收集,因此关注小batch\_size下的预测速度,关注延迟,例如预测服务常用的batch\_size=1, 4等。 + +`Fluid `__ 是PaddlePaddle从0.11.0版本开始引入的设计,本文的基准测试在该版本上完成。 + + +环境搭建 +"""""""""""" + +基准测试中模型精度和硬件、框架无关,由模型结构和数据共同决定;性能方面由测试硬件和框架性能决定。框架基准测试为了对比框架之间的差异,控制硬件环境,系统库等版本一致。下文中的对比实验都在相同的硬件条件和系统环境条件下进行. + + +不同架构的GPU卡性能差异巨大,在验证模型在GPU上训练性能时,可使用NVIDIA提供的工具:code `nvidia-smi` 检验当前使用的GPU型号,如果测试多卡训练性能,需确认硬件连接是 `nvlink `__ 或 `PCIe `__ 。 同样地,CPU型号会极大影响模型在CPU上的训练性能。可读取`/proc/cpuinfo`中的参数,确认当前正在使用的CPU型号。 + +下载GPU对应的Cuda Tool Kit和 Cudnn,或者使用NVIDIA官方发布的nvidia-docker镜像 `nvidia-docker `__, 镜像内包含了Cuda和Cudnn,本文采用这种方式。 Cuda Tool Kit包含了GPU代码使用到的基础库,影响在此基础上编译出的Fluid二进制运行性能。 + +准备好Cuda环境后,从github上的下载Paddle并源码编译,会生成对应的最适合当前GPU的sm\_arch二进制\ `sm\_arch `__\ 。另外,cudnn对卷积类任务影响巨大,在基准测试中需要小版本一致,例如Cudnn7.0.2与Cudnn7.1.4在Resnet上有5%以上差异。 + + +选择基准模型 +"""""""""""" + +对框架做基准测试,需要覆盖不同训练任务和不同大小的模型,本文中选取了图像和NLP的最为常用的5个模型。 + +============ ============ ================= ============ +任务种类 模型名称 网络结构 数据集 +============ ============ ================= ============ +图像分类 mnist Lenet mnist +图像分类 VGG VGG-16 Flowers102 +图像分类 Resnet Resnet-50 Flowers102 +文本分类 Stacked-LSTM Stacked-LSTM IMDB +机器翻译 seq-seq Stacked-LSTM wmt14 +============ ============ ================= ============ + +其中mnist, VGG, Resnet属于CNN模型, stacked-lstm, seq2seq代表RNN模型。 +`benchmark `__ +基准模型测试脚本中,均跳过了前几个batch的训练过程,原因是加载数据和分配显存受系统当前运行情况影响,会导致统计性能不准确。运行完若干个轮次后,统计对应指标。 + + +基准模型的数据的选择方面,数据量大且验证效果多的公开数据集为首选。图像模型VGG和resnet, 本文选择了 `flowers102 `__ ,图像大小预处理为和Imagenet相同大小,因此性能可直接对比 +NLP模型的公开且影响力大数据集较少,seq2seq模型选择了wmt14数据,stacked-lstm模型中选择了 `imdb `__ 数据。 + + +注意,图像模型每条样本大小相同,图像经过变换后大小一致,因此经过的计算路径基本相同,计算速度和显存占用波动较小,可以从若干个batch的数据中采样得到当前的训练性能数据。而NLP模型由于样本长度不定,计算路径和显存占用也不相同,因此只能完整运行若干个轮次后,统计速度和显存消耗。 +显存分配是特别耗时的操作,因此Fluid默认会占用所有可用显存空间形成显存池,用以加速计算过程中的显存分配。如果需要统计模型真实显存消耗,可设置环境变量`FLAGS_fraction_of_gpu_memory_to_use=0.0`,观察最大显存开销。 + + +测试过程 +"""""""""""" + +- CPU 单机单线程测试 + +测试CPU上单线程的性能,先设置CUDA的环境变量为空,``CUDA_VISIBLE_DEVICES=``,并通过环境变量关闭OpenMP和MKL的多线程 ``OMP_NUM_THREADS=1``, ``MKL_NUM_THREADS=1;``。 +然后代码中设置为使用CPUPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=False即可。 + +.. code-block:: python + + >>> import paddle.fluid as fluid + >>> place = fluid.CPUPlace() + +.. code:: bash + + docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash + + +- GPU 单机单卡测试 + +本教程使用了Cuda8, Cudnn7.0.1。来源为:code `nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04` + +.. 
code:: bash + + nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash +在单卡上测试,设置CUDA的环境变量使用一块GPU,``CUDA_VISIBLE_DEVICES=0`` +然后代码中设置为使用CUDAPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=True即可。 + +.. code-block:: python + + >>> import paddle.fluid as fluid + >>> place = fluid.CUDAPlace(0) // 0 指第0块GPU + + +测试结果 +"""""""""""" + +本教程对比相同环境下的Fluid0.12.0和TensorFlow1.4.0的性能表现。 +硬件环境为 CPU: Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz, GPU: TITAN X(Pascal) 12G x 1, Nvidia-Driver 384.90。 +系统环境为Ubuntu 16.04.3 LTS, 本文中采用了docker环境,系统版本为nvidia-docker17.05.0-ce。 +测试的Fluid版本为\ `v.0.12.0 `__ 。 +TensorFlow版本为\ `v.1.4.0-rc1 `__ 。 +使用的脚本和配置见\ `benchmark `__ 。 +图表中统计单位为samples/秒。 + +- CPU 单机单线程测试结果 + + ================ ==================== =================== + Speed Fluid CPU TensorFlow CPU + ================ ==================== =================== + mnist 1298.75 samples/s 637.57 samples/s + VGG-16 0.4147 images/s 0.1229 images/s + Resnet-50 1.6935 images/s 0.3657 images/s + Stacked-LSTM 472.3225 words/s 48.2293words/s + Seq2Seq 217.1655 words/s 28.6164 words/s + ================ ==================== =================== + +- GPU 单机单卡测试结果 + + =============== ===================== ================= + Speed Fluid GPU TensorFlow GPU + =============== ===================== ================= + mnist 19710.90 samples/s 15576.3 samples/s + VGG-16 59.83327 images/s 40.9967 images/s + Resnet-50 105.84412 97.8923 images/s + Stacked-LSTM 1319.99315 1608.2526 words/s + Seq2Seq 7147.89081 6845.1161 words/s + =============== ===================== ================= diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..08ea379f81d16407ed5f82770b55a34bcf138da8 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md @@ -0,0 +1,56 @@ +# Anakin ARM 性能测试 + +## 测试环境和参数: ++ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd ++ 采用android ndk交叉编译,gcc 4.9,enable neon, ABI: armveabi-v7a with neon -mfloat-abi=softfp ++ 测试平台 + - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz + - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz + - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz ++ 多线程:openmp ++ 时间:warmup10次,运行10次取均值 ++ ncnn版本:来源于github的master branch中commits ID:307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput)版本 ++ TFlite版本:来源于github的master branch中commits ID:65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds)版本 + +在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析 + +## BenchMark model + +> 注意在性能测试之前,请先将测试model通过[External Converter](#10003)转换为Anakin model +> 对这些model,本文在ARM上进行多线程的单batch size测试。 + +- [Mobilenet v1](#11) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* +- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* +- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载* + +### mobilenetv1 + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan| + 
|高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan| + |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan| + +### mobilenetv2 + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan| + |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan| + |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan| + +### mobilenet-ssd + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan| + |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan| + |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan + +## How to run those Benchmark models? + +1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换 +2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机 +3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令 +4. 最后,终端显示器上将会打印该模型的运行时间 +5. 其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md new file mode 100644 index 0000000000000000000000000000000000000000..e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md @@ -0,0 +1,28 @@ +# Example +Anakin目前只支持NCHW的格式 +示例文件在test/framework/net下 + +## 在NV的GPU上运行CNN模型 +示例文件为打开example_nv_cnn_net.cpp,整体流程如下: +- 将模型的的path设置为anakin模型的路径,初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到 +- 根据模型设置网络图的输入尺寸,进行图优化 +- 根据优化后的网络图初始化网络执行器 +- 取出网络的输入tensor,将数据拷贝到输入tensor +- 运行推导 +- 取出网络的输出tensor + +以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关 + +## 在X86上运行RNN模型 +示例文件为example_x86_rnn_net.cpp +整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: +- 使用X86标识初始化图对象和网络执行器对象 +- rnn模型的输入尺寸是可变的,初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词,其中第0到第4个词是第一句话,第5到第11个词是第二句话 + +以X86平台为例演示Anakin框架的使用方法,注意编译时需要打开X86编译开关 + +## 在NV的GPU上使用Anakin的线程池运行CNN模型 +示例文件为example_nv_cnn_net_multi_thread.cpp ,示例使用worker的同步预测接口 +整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: +- 用模型地址和线程池大小初始化worker对象 +- 将输入tensor注入任务队列,获得输出tensor diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..667f9396f1169a0d891b9e6b0e912aa5527ab0b8 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md @@ -0,0 +1,170 @@ +# Anakin GPU Benchmark + +## Machine: + +> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz` +> GPU: `Tesla P4` +> cuDNN: `v7` + + +## Counterpart of anakin : + +The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support. + +## Benchmark Model + +The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`. + You can use pretrained caffe model or the model trained by youself. 
+ +> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md) + + +- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)* +- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)* +- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* +- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* +- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [RNN](#7) *not support yet* + +We tested them on single-GPU with single-thread. + +### VGG16 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 8.8690 | 8.2815 | +| 2 | 15.5344 | 13.9116 | +| 4 | 26.6000 | 21.8747 | +| 8 | 49.8279 | 40.4076 | +| 32 | 188.6270 | 163.7660 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 963 | 997 | +| 2 | 965 | 1039 | +| 4 | 991 | 1115 | +| 8 | 1067 | 1269 | +| 32 | 1715 | 2193 | + + +### Yolo + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 16.4596| 15.2124 | +| 2 | 26.6347| 25.0442 | +| 4 | 43.3695| 43.5017 | +| 8 | 80.9139 | 80.9880 | +| 32 | 293.8080| 310.8810 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 1569 | 1775 | +| 2 | 1649 | 1815 | +| 4 | 1709 | 1887 | +| 8 | 1731 | 2031 | +| 32 | 2253 | 2907 | + +### Resnet50 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 4.2459 | 4.1061 | +| 2 | 6.2627 | 6.5159 | +| 4 | 10.1277 | 11.3327 | +| 8 | 17.8209 | 20.6680 | +| 32 | 65.8582 | 77.8858 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 531 | 503 | +| 2 | 543 | 517 | +| 4 | 583 | 541 | +| 8 | 611 | 589 | +| 32 | 809 | 879 | + +### Resnet101 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 7.5562 | 7.0837 | +| 2 | 11.6023 | 11.4079 | +| 4 | 18.3650 | 20.0493 | +| 8 | 32.7632 | 36.0648 | +| 32 | 123.2550 | 135.4880 | + +- GPU Memory Used (`MB)` + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 701 | 683 | +| 2 | 713 | 697 | +| 4 | 793 | 721 | +| 8 | 819 | 769 | +| 32 | 1043 | 1059 | + +### MobileNet V1 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 45.5156 | 1.3947 | +| 2 | 46.5585 | 2.5483 | +| 4 | 48.4242 | 4.3404 | +| 8 | 52.7957 | 8.1513 | +| 32 | 83.2519 | 31.3178 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 329 | 283 | +| 2 | 345 | 289 | +| 4 | 371 | 299 | +| 8 | 393 | 319 | +| 32 | 531 | 433 | + +### MobileNet V2 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 65.6861 | 2.9842 | +| 2 | 66.6814 | 4.7472 | +| 4 | 69.7114 | 7.4163 | +| 8 | 76.1092 | 12.8779 | +| 32 | 124.9810 | 47.2142 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 341 | 293 | +| 2 | 353 | 301 | +| 4 | 385 | 319 | +| 8 | 421 | 351 | +| 32 | 637 | 551 | + +## How to run those Benchmark models? + +> 1. 
At first, you should parse the caffe model with [`external converter`](https://github.com/PaddlePaddle/Anakin/blob/b95f31e19993a192e7428b4fcf852b9fe9860e5f/docs/Manual/Converter_en.md). +> 2. Switch to *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this file. +> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen. +> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in model log file. diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..5efbc89abd469871b318c306e8cb03dd95f0c85b --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md @@ -0,0 +1,639 @@ +# Anakin 使用教程 ## + +本教程将会简略的介绍Anakin的工作原理,一些基本的Anakin API,以及如何调用这些API。 + +## 内容 ### + +- [Anakin的工作原理](#principle) +- [Anakin APIs](#api) +- [示例代码](#example) + +## Anakin的工作原理 ### + +![Anakin_principle](../pics/anakin_fm_ch.png) + +用Anakin来进行前向计算主要分为三个步骤: + +- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型 + 在使用Anakin之前,用户必须将所有其他模型转换成Anakin模型,我们提供了转换脚本,用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。 +- 生成Anakin计算图 + 加载Anakin模型生成原始计算图,然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。 +- 执行计算图 + Anakin会选择不同硬件平台执行计算图。 + + +## Anakin APIs ### +### Tensor #### + +`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性: + +- Buffer + 数据存储区 +- Shape + 数据的维度信息 +- Event + 用于异步计算的同步 + + `Tensor` 类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `offset`。 `_shape`为`tensor`真正空间信息,`_valid_shape`表示当前`tensor`使用的空间信息, `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`不同维度与分别与数学中的向量、矩阵等相对应如下表所示。 + + +Dimentions | Math entity | + :----: | :----: +1 | vector +2 | matrix +3 | 3-tensor +n | n-tensor + +#### 声明tensor对象 + +`Tensor`接受三个模板参数: + + +```c++ + template + class Tensor .../* Inherit other class */{ + //some implements + ... + }; +``` + +TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应;[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下: + +1. TargetType + + Anakin TargetType | platform + :----: | :----:| + NV | NVIDIA GPU + ARM | ARM + AMD | AMD GPU + X86 | X86 + NVHX86 | NVIDIA GPU with Pinned Memory + +2. DataType + +Anakin DataType | C++ | Description +:---: | :---: | :---: | +AK_HALF | short | fp16 +AK_FLOAT | float | fp32 +AK_DOUBLE | double | fp64 +AK_INT8 | char | int8 +AK_INT16 | short | int16 +AK_INT32 | int | int32 +AK_INT64 | long | int64 +AK_UINT8 | unsigned char | uint8 +AK_UINT16 | unsigned short | uint8 +AK_UINT32 | unsigned int | uint32 +AK_STRING | std::string | / +AK_BOOL | bool | / +AK_SHAPE | / | Anakin Shape +AK_TENSOR | / | Anakin Tensor + + +3. 
LayOutType + +Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support +:---: | :---: | :---: | :---: | +W | 1-D | YES | NO +HW | 2-D | YES | NO +WH | 2-D | YES | NO +NW | 2-D | YES | YES +NHW | 3-D | YES |YES +NCHW ( default ) | 4-D | YES | YES +NHWC | 4-D | YES | NO +NCHW_C4 | 5-D | YES | YES + + +理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOutType,NCHW_C4是专门针对于int8这种数据类型的。 + + +例子 + +> 下面的代码将展示如何使用tensor, 我们建议先看看这些示例。 + +> 要想获得更多关于tensor的信息, 请参考 *soure_path/core/tensor.h* + +> 1. 使用shape对象初始化tensor +``` c++ + //create a null tensor. A null tensor holds for nothing. + //tensor's buffer is resident at CPU and its datatype is AK_FLOAT. + //tensor's Layout is NCHW(default) + Tensor mytensor; + + //1. using shape object to create a tensor. + Shape shape1(NUM); //1-D shape. NUM is the number of dimention. + Tensor mytensor1(shape1); //1-D tensor. + + // A 4-D shape + Shape shape2(N, C, H, W); // batch x channel x height x width +``` + +>`注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示` + + +```c++ + // A 4-D tensor. + Tensor mytensor2(shape2); //right + + //A 4-D tensor which is resident at GPU and its datatype is AK_INT8 + Tensor mytensor3(shape2); //right + + Tensor mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout. + Tensor mytensor5(shape2); //wrong!!!! + +``` + +> 2. 使用现有的数据和shape初始化tensor + +```c++ + + /** + * A construtor of Tensor. + * data_ptr is a pointer to any data type of data + * TargetType is type of a platform [Anakin TargetType] + * id : device id + * shape: a Anakin shape + */ + Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape); + + //using existing data feed to a tensor + Tensor mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W). + +``` + +> 3. 使用tensor初始化tensor + +```c++ + Tensor tensor(exist_tensor); +``` + + +> 提示: 你可以用` typedef Tensor Tensor4d_X86 `方便定义tensor + + +#### 填充tensor数据区 + + +填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。 + +```c++ +首先来看看tensor的四种声明方式: + +1. Tensor mytensor; +2. Tensor mytensor1(shape1); +3. Tensor mytensor(data_ptr, TargetType, device_id, shape); +4. Tensor tensor(exist_tensor); + + +相关的声明方式的数据填充方法如下: + +1:声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。 + + //parama shape + mytensor.re_alloc(Shape shape); + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... + +2: 这种声明方式会自动分配内存 + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor1.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... + + +3:在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的 +tensor都在都一个目标平台上,那么该tensor就会与data_ptr共享内存空间,相反,如果他们不在同一个平台上(如data_ptr在X86上,而 +tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。 + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... 
+ +4:该种方式仍不需要手动分配内存 + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... + + +另外,你还可以获取一个tensor的可读指针,示例如下: + //Get read-only pointer to mytensor. + //parama index (int): where you start to read. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.data(index/*=0*/); + //do something ... +``` + +如果想更详细的了解tensor,请查阅*soure_path/saber/core/tensor.h* + +#### 获取tensor的shape + +```c++ +//some declarations +// ... +Shape shape = mytensor.shape(); + +//Get a first dimetion size of tesor, if it has. +int d1 = shape[0]; + +//Get a second dimention size of tensor, if it has. +int d2 = shape[1]; + +... + +//Get a n-th dimention size of tensor, if it has. +int dn = shape[n-1]; + + +//Get a tensor's dimention +int dims = mytensor.dims(); + +//Get the size of tensor. +//size = d1 x d2 x ... x dn. +int size = mytensor.size(); + +//Get the size of tensor at interval [Di, Dj) +// form i-th dimention to j-th dimention, but not including the j-th dimention. +// which means di x (di+1) x ... x (dj -1) +int size = mytensor.count(start, end); +``` + +#### 设置tensor的shape + +我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义 + + +```c++ +/** + * \brief set a tensor's shape + * \param valid_shape [a Shape object] + * \param shape [a Shape object] + * \param offset [a Shape object] + * \return the status of this operation, that means whether it success * or not. + */ +SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); +``` + +这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。 + +```c++ + +// some declarations +// ... +//valid_shape, shape , offset are Shape object; +//All these Shape object's LayOutType must be equal to mytensor's. +mytensor.set_shape(valid_shape, shape, offset); + +``` + +#### 重置 tensor的shape + +```c++ +//some declarations +Shape shape, valid_shape, offset; + +//do some initializations +... +mytensor.reshape(valid_shape, shape, offset); +``` + +注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同 + + +### Graph ### + +`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。 + +#### 图的声明 + +与`Tensor`一样,graph也接受三个模板参数。 + +```c++ + +template +class Graph ... /* inherit other class*/{ + + //some implements + ... + +}; +``` + +前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。 + + +```c++ + +//Create a empty graph object. +Graph graph = Graph tmp(); + +//Create a pointer to a empty graph. +Graph *graph = new Graph(); + +//Create a pointer to a empty graph. +auto graph = new Graph(); + +``` + +#### 加载 Anakin 模型 + +```c++ +//some declarations +... +auto graph = new Graph(); +std::string model_path = "the/path/to/where/your/models/are"; +const char *model_path1 = "the/path/to/where/your/models/are"; + +//Loading Anakin model to generate a compute graph. +auto status = graph->load(model_path); + +//Or this way. +auto status = graph->load(model_path1); +//Check whether load operation success. +if(!status){ + std::cout << "error" << endl; + //do something... 
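+    // For example, print detailed error info here, as the single-thread sample later in this
+    // document does: LOG(FATAL) << " [ERROR] " << status.info();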
+} + +``` + +#### 优化计算图 + +```c++ +//some declarations +... +//Load graph. +... +//According to the ops of loaded graph, optimize compute graph. +graph->Optimize(); + +``` + +> 注意: 第一次加载原始图,必须要优化。 + +#### 保存模型 + +你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。 + + +```c++ +//some declarations +... +//Load graph. +... +// save a model +//save_model_path: the path to where your model is. +auto status = graph->save(save_model_path); + +//Checking +if(!status){ + cout << "error" << endl; + //do somethin... +} +``` + +#### 重新设置计算图里的tensor的shape + +```c++ +//some declarations +... +//Load graph. +... +vector shape{10, 256, 256, 10}; +//input_name : std::string. +//Reshape a tensor named input_name. +graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object. +``` + +#### 设置 batch size + +`Graph` 支持重新设置batch size的大小。 + +```c++ +//some declarations +... +//Load graph. +... +//input_name : std::string. +//Reset a tensor named input_name. +int new_batch_size = 4; +graph->ResetBatchSize(input_name, new_batch_size); +``` + +### Net ### + + +`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出 +#### Creating a graph executor + +`Net`接受四个模板参数。 + + +```c++ +template +class Net{ + //some implements + ... + +}; +``` +由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*. + + +1. Precision + +Precision | Op support +:---: | :---: +Precision::INT4 | NO +Precision::INT8 | NO +Precision::FP16 | NO +Precision::FP32 | YES +Precision::FP64 | NO + +现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision. + + + +2. OpRunType + +OpRunType | Sync/Aync |Description +:---: | :---: | :---: +OpRunType::SYNC | Synchronization | single-stream on GPU +OpRunType::ASYNC | Asynchronization | multi-stream on GPU + +用graph对象创建一个执行器。 +```c++ +//some declarations +... +//Create a pointer to a graph. +auto graph = new Graph(); +//do something... +... + +//create a executor +Net executor(*graph); + +``` + +#### 获取输入输出tensor + + +获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码 + +```c++ +//some declaratinos +... + +//create a executor +//TargetType is NV [NVIDIA GPU] +Net executor(*graph); + +//Get the first input tensor. +//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU. +//Note: Member function get_in returns an pointer to tensor. +Tensor* tensor_in0 = executor.get_in("input_0"); + +//If you have multiple input tensors +//You just type this code below. +Tensor* tensor_in1 = executor.get_in("input_1"); +... +auto tensor_inn = executor.get_in("input_n"); +``` + +当得到输入tensor之后,就可以填充它的数据区了。 + +```c++ +//This tensor is resident at GPU. +auto tensor_d_in = executor.get_in("input_0"); + +//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one. + +//using Tensor4d = Tensor; +Tensor4d tensor_h_in; //host tensor; +//Tensor tensor_h_in; + +//Allocate memory for host tensor. +tensor_h_in.re_alloc(tensor_d_in->valid_shape()); +//Get a writable pointer to tensor. +float *h_data = tensor_h_in.mutable_data(); + +//Feed your tensor. +/** example +for(int i = 0; i < tensor_h_in.size(); i++){ + h_data[i] = 1.0f; +} +*/ +//Copy host tensor's data to device tensor. 
+tensor_d_in->copy_from(tensor_h_in); + +// And then +``` + + +类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输入tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor: +```c++ +//Note: this tensor are resident at GPU. +Tensor* tensor_out_d = executor.get_out("pred_out"); + +``` + + +#### Executing graph + + +当一切准备就绪后,我们就可以执行真正的计算了! +```c++ +executor.prediction(); +``` + +## 示例代码 ## + +下面的例子展示了如何调用Anakin。 + +在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。 + +### Single-thread + +单线程例子在 *source_root/test/framework/net/net_exec_test.cpp`* + +```c++ + +std::string model_path = "your_Anakin_models/xxxxx.anakin.bin"; +// Create an empty graph object. +auto graph = new Graph(); +// Load Anakin model. +auto status = graph->load(model_path); +if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); +} +// Reshape +graph->Reshape("input_0", {10, 384, 960, 10}); +// You must optimize graph for the first time. +graph->Optimize(); +// Create a executer. +Net net_executer(*graph); + +//Get your input tensors through some specific string such as "input_0", "input_1", and +//so on. +//And then, feed the input tensor. +//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out. +auto d_tensor_in_p = net_executer.get_in("input_0"); +Tensor4d h_tensor_in; +auto valid_shape_in = d_tensor_in_p->valid_shape(); +for (int i=0; icopy_from(h_tensor_in); + +//Do inference. +net_executer.prediction(); + +//Get result tensor through the name of output node. +//And also, you need to see the dash board again to find out how many output nodes are and remember their name. + +//For example, you've got a output node named obj_pre_out +//Then, you can get an output tensor. +auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor. +auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor. +//...... +// do something else ... +//... +//save model. +//You might not optimize the graph when you load the saved model again. 
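+// Note: as described in the "保存模型" section above, the ".saved" model written below can be
+// reloaded later with graph->load(save_model_path), so graph->Optimize() need not be called again.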
+std::string save_model_path = model_path + std::string(".saved"); +auto status = graph->save(save_model_path); +if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); +} + +``` diff --git a/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md new file mode 100644 index 0000000000000000000000000000000000000000..56ca582b2b47f404ede777712830731ea7f4e9b5 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md @@ -0,0 +1,73 @@ +# 模型转换指南 + +Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型。本文档介绍如何转换模型。 + +## 简介 + +Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。 + +模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。 + +您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。 + + +## 系统要求 + +- python 2.7+ +- pyyaml +- flask +- protobuf 3.5+ + + +## 用法 + +### 1、环境 +转换器所需的依赖标注于 *系统要求* 一节。 + +### 2、配置 +您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。 + +#### config.yaml +```bash +OPTIONS: + Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID + SavePath: ./output # 转换结束后模型的保存位置 + ResultName: googlenet # 输出模型的名字 + Config: + LaunchBoard: ON # 是否生成网络结构预览页面 + Server: + ip: 0.0.0.0 + port: 8888 # 从一个可用端口访问预览页面 + OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项 + enable: OFF + path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved + LOGGER: + LogToPath: ./log/ # 生成日志的路径 + WithColor: ON + +TARGET: + CAFFE: + # 当 Framework 为 CAFFE 时需填写 + ProtoPaths: + - /path/to/caffe/src/caffe/proto/caffe.proto + PrototxtPath: /path/to/your/googlenet.prototxt + ModelPath: /path/to/your/googlenet.caffemodel + + FLUID: + # 当 Framework 为 FLUID 时需填写 + Debug: NULL + ProtoPaths: + - / + PrototxtPath: /path/to/fluid/inference_model + ModelPath: /path/to/fluid/inference_model + # ... 
+``` + +### 3、转换 +在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。 + + +### 4、预览 +最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。 + +> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md new file mode 100644 index 0000000000000000000000000000000000000000..f2783eb9f591a31443f2a692ce0eb1bcc9b1063a --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md @@ -0,0 +1,405 @@ +# 如何增加新的Operator + +## 基本概念 + +简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。 + +```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。 + +```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。 + +saber的文件结构: +* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件明均不带impl。 +* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。 +* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。 +* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。 + +### 涉及到的基类及各个类之前的关系 + +简单介绍相关的基类 + +* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h + +* ```anakin::saber::BaseFunc```: saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。 + +* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。 + +## 添加operator + +添加一个新的op需要以下几步: + +1. 添加saber的param +2. 定义saber的Operator类 +3. 定义新的impl声明 +3. 完成新的impl实现 +4. 
增加framework的实现或特化 + +接下来就针对这几步,以一个简单例子为例介绍实现。 + +例如我们要添加新的Mul op。给出计算公式如下:$$Out = alpha \dot X * Y$$ + +### 为operator增加param + +涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。 +这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。 +``` +template // 能够获得target, datatype, layout +struct MulParam{ + MulParam() + : alpha(0) + {} + MulParam(float alpha_in) + : alpha(alpha_in) + {} + MulParam(const MulParam& right) + : alpha(right.alpha) + {} + MulParam &operator=(const MulParam &right) { + alpha = right.alpha; + } + bool operator==(const MulParam &right) { + return alpha == right.alpha; + } + float alpha; +}; +``` + +### 定义Operator类 +涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。 +下面给出一个相对完整的定义结构供参考。 +``` +//不同的设备需要包含对应的operator实现.[详见](#impl) +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_mul.h" +#include "saber/funcs/impl/cuda/vender_mul.h" +#endif +//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare) +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/impl_mul.h" +#endif +namespace anakin { +namespace saber { +template +class Mul : public BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, MulParam> { +public: + using BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, MulParam>::BaseFunc; + Mul() = default; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef MulParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + //计算输出的shape, + Shape output_shape = (input[0]->valid_shape()); + /* code */ + return output[0]->set_shape(output_shape); + } + virtual SaberStatus init_impl(ImplEnum implenum) override { + // 不同设备均使用此init_impl, 此接口创建对应impl的实现。 + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderMul ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberMul ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } +private: + virtual void pick_best_static() override { + if (true) // some condition? 
+ this->_best_impl = this->_impl[0]; + } + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } +}; +} // namespace saber +} // namespace anakin +``` + +### 为operator增加新的impl声明 + +涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。 +``` +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ +namespace saber{ +DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字 +} +} +``` + +### 完成新的operator特定后端实现 + +涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h``` +这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。 + +``` +// include 对应的声明 +#include "saber/funcs/impl/impl_mul.h" + +namespace anakin{ +namespace saber{ +template +class VenderMul : + public ImplBase< + Tensor, + Tensor, + Tensor, + MulParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + VenderMul(){} + ~VenderMul() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + MulParam& param, Context& ctx) { + this->_ctx = ctx; + create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + MulParam& param, Context& ctx) { + // set内部参数 + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + MulParam& param) { + // dispatch kernel. + } + +private: +}; +} +} +``` +```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。 +### 添加framework的特化 + +涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。 +这里简单介绍下如果添加或修改framework内的operator + +``` +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/mul.h" // 需要包对应的saber头文件 +namespace anakin { +namespace ops { +template +class MulHelper; + +template +class Mul : public Operator { +public: + Mul() {} + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + } + friend class MulHelper; +}; +template +class MulHelper : public OperatorHelper { +public: + MulHelper() = default; + ~MulHelper(); + Status InitParam() override; + + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + saber::MulParam> _param_mul; + saber::Mul _funcs_mul; +}; +} +} /* namespace anakin */ +``` +对应的```.cpp```文件如下: +``` +#include "framework/operators/mul.h" + +namespace anakin { +namespace ops { + +#ifdef USE_CUDA +template<> +void Mul::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_mul; + impl->_funcs_mul(ins, outs, param, ctx); +} +#endif + +template +Status MulHelper::InitParam() { + auto alpha = GET_PARAMETER(float, alpha); + 
MulParam> param_mul(alpha); + _param_mul = param_mul; + return Status::OK(); +} + +template +Status MulHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} + +template +Status MulHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class MulHelper; +#endif +#ifdef USE_ARM_PLACE +template class MulHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Mul) +.Doc("Mul operator") +#ifdef USE_CUDA +.__alias__("mul") +#endif +#ifdef USE_ARM_PLACE +.__alias__("mul") +#endif +.num_in(1) +.num_out(1) +.Args("alpha", " alpha of Mul "); //注册 + +} /* namespace ops */ + +} /* namespace anakin */ +``` + +## 实现单元测试 +涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp``` +在对应的test下需要添加新的单元测试 + +``` +TEST(TestSaberFuncNV, test_depthwise_conv) { + + // init tensors and some param. + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + // create param + MulParam > param(alpha); + + std::vector*> input; + std::vector*> output; + + // create saber op + Mul mul; + + // compute output shape + mul.compute_output_shape(input, output, param); + + // re_alloc output tensors memory based on output shape + output[0]->re_alloc(output[0]->shape()); + + // init saber op(calling init and create) + mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + + // call operator() + mul(input, output, param, ctx1); + + // cuda specified, record events + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + output_dev.sync(); + + // param changed + param.alpha = 2.0; + // auto calling saber op(create and dispatch) + mul(input, output, param, ctx1); + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} + +int main(int argc, const char** argv){ + anakin::saber::Env::env_init(); + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + +``` +## 调试及注意事项 + +一个op需要有对外的op接口和内部实现,由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现, diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md new file mode 100644 index 0000000000000000000000000000000000000000..a1f75f5e95cfb90f26d3782ba30a6d1887a70424 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md @@ -0,0 +1,459 @@ +# 如何支持一个新的设备 + +## 概览 + +添加一个新的设备需要以下3个步骤: + +* [在`CMakeList`中添加设备的支持](#0001) +* [在`saber`中添加设备的实现](#0002) +* [在`framework`中添加设备的具体化或实例化](#0003) + +假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。 + +## 在`CMakeList`中添加设备的支持 ## + +* 修改根目录`CMakeList.txt` +```cmake +#select the plantform to build +anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO) +anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO) +anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO) +anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." 
YES) +``` + +* 修改`saber/CMakeList.txt` + +根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。 +```cmake +if(USE_TNEW_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) +endif() +``` + +* 修改`test/CMakeList.txt` + +新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。 +```cmake +if(USE_TNEW_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC) +endif() +``` + +* 修改`cmake/anakin_config.h.in` +```c++ +// plantform to use +#cmakedefine USE_GPU_PLACE + +#cmakedefine USE_X86_PLACE + +#cmakedefine USE_ARM_PLACE + +#cmakedefine USE_TNEW_PLACE +``` + +* 其他依赖和编译选项 +修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake` + + +## 在`saber`中添加设备的实现 ## +`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。 + +### 在`saber/saber_types.h`中添加设备 + +```c++ +enum TargetTypeEnum { + eINVALID = -1, + eNV = 1, + eAMD = 2, + eARM = 3, + eX86 = 4, + eNVHX86 = 5, + eTNEW = 6 +}; + +typedef TargetType NV; +typedef TargetType ARM; +typedef TargetType AMD; +typedef TargetType X86; +typedef TargetType TNEW; + +``` + +### 在`saber/core`中添加设备的实现 + +1. 在`target_traits.h`中添加新设备 + +* 增加设备类型 +```c++ +struct __cuda_device{}; +struct __arm_device{}; +struct __amd_device{}; +struct __x86_device{}; +struct __tnew_device{}; +``` + +* `TargetTypeTraits`模板具体化 +```c++ +template <> +struct TargetTypeTraits { + typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择 + typedef __tnew_device target_type; +}; +``` + +2. 在`data_traits.h`中特化`DataTrait`模板类 + +如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下: +```c++ +#ifdef USE_OPENCL +struct ClMem{ + ClMem(){ + dmem = nullptr; + offset = 0; + } + + ClMem(cl_mem* mem_in, int offset_in = 0) { + dmem = mem_in; + offset = offset_in; + } + + ClMem(ClMem& right) { + dmem = right.dmem; + offset = right.offset; + } + + ClMem& operator=(ClMem& right) { + this->dmem = right.dmem; + this->offset = right.offset; + return *this; + } + + ClMem& operator+(int offset_in) { + this->offset += offset_in; + return *this; + } + + int offset{0}; + cl_mem* dmem; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef float dtype; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef double dtype; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef char dtype; +}; +#endif //use_opencl +``` + +3. 
在`target_wrapper.h`中特化`TargetWrapper`模板类 + +特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下: +```c++ +template <> +struct TargetWrapper { //根据TNEW的具体类型修改__xxx_target,__host_target或者__device_target + + typedef xxx_event event_t; //根据设备实现xxx_event + typedef xxx_stream stream_t; //根据设备实现xxx_stream + + static void get_device_count(int& count); + + static void set_device(int id); + + //We should add strategy to avoid malloc directly + static void mem_alloc(void** ptr, size_t n); + + static void mem_free(void* ptr); + + static void mem_set(void* ptr, int value, size_t n); + + static void create_event(event_t& event, bool flag = false); + + static void create_stream(stream_t& stream); + + static void create_stream_with_flag(stream_t& stream, unsigned int flag); + + static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority); + + static void destroy_stream(stream_t& stream); + + static void destroy_event(event_t& event); + + static void record_event(event_t& event, stream_t stream); + + static void query_event(event_t& event); + + static void sync_event(event_t& event); + + static void sync_stream(event_t& event, stream_t& stream); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoD); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __DtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __HtoD); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __HtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoH); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __DtoH); + + static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count); + + static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count, stream_t& stream); + + static int get_device_id(); +}; + +``` + +4. 在`impl/`目录下添加设备目录和实现 + +在`saber/core/impl`目录下添加设备目录`tnew`。 +* 实现`TargetWrapper`结构体中各函数的定义。 +如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。 + +```c++ +typedef TargetWrapper TNEW_API; +void TNEW_API::get_device_count(int &count) { + // add implementation +} + +void TNEW_API::set_device(int id){ + // add implementation +} + +void TNEW_API::mem_alloc(void** ptr, size_t n){ + // add implementation +} + +void TNEW_API::mem_free(void* ptr){ + if(ptr != nullptr){ + // add implementation + } +} +... 
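+// A purely illustrative sketch (assuming the hypothetical TNEW behaves like a host-side
+// device and can use the C standard library; a real device should call its own runtime API):
+// void TNEW_API::mem_set(void* ptr, int value, size_t n){
+//     memset(ptr, value, n);   // needs <cstring>
+// }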
+ +``` + +* 特化实现`device.h`中的`Device` + +```c++ +template <> +void Device::create_stream() { + // add implementation +} + +template <> +void Device::get_info() { + + // add implementation +} + +``` + +### 在`saber/funcs`中实现设备相关的op + +参考[如何增加新的Operator](addCustomOp.md) + + +## 在`framework`中添加设备的具体化或实例化 ## + +### `framework/core` + +* `net.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class Net; +template class Net; +#endif +``` + +* `operator_func.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class OperatorFunc; +#endif +``` + +* `worker.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class Worker; +template class Worker; +#endif +``` + +* `operator_attr.cpp`中添加实例化 + +```c++ +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +``` + +* `parameter.h`中添加设备的实现 + +```c++ +#ifdef USE_TNEW_PLACE +template +class PBlock { +public: + typedef Tensor4d::type> type; + + PBlock() { + _inner_tensor = std::make_shared(); + } + ... +} +#endif //TNEW +``` + +* `type_traits_extend.h`中添加设备的实现 + +```c++ +template<> +struct target_host { + typedef saber::X86 type; //根据TNEW选择正确的host type +}; +``` + +### `framework/graph` + +* `graph.cpp`中添加实例化 + +```c++ + #ifdef USE_TNEW_PLACE + template class Graph; + template class Graph; + template class Graph; + #endif +``` + +### `framework/model_parser` + +* `parser.cpp`中添加实例化 + +```c++ + #ifdef USE_TNEW_PLACE + template + Status load(graph::Graph* graph, + const char* model_path); + template + Status load(graph::Graph* graph, + const char* model_path); + template + Status load(graph::Graph* graph, + const char* model_path); + + template + Status save(graph::Graph* graph, + std::string& model_path); + template + Status save(graph::Graph* graph, + std::string& model_path); + template + Status save(graph::Graph* graph, + std::string& model_path); + + template + Status load(graph::Graph* graph, + std::string& model_path); + template + Status load(graph::Graph* graph, + std::string& model_path); + template + Status load(graph::Graph* graph, + std::string& model_path); + + template + Status save(graph::Graph* graph, + const char* model_path); + template + Status save(graph::Graph* graph, + const char* model_path); + template + Status save(graph::Graph* graph, + const char* model_path); + #endif +``` + +* `model_io.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class NodeIO; +template class NodeIO; +template class NodeIO; +#endif +``` + +### `framework/operators` + +为`framework/operators`目录下所有op添加实例化或具体化 +以`activation.cpp`为例,实例化如下: + +```c++ +#ifdef USE_TNEW_PLACE +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); +template class ActivationHelper; +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); +#endif +``` + +如果TNEW设备函数的实现与现有模板实现不一致,可以特化实现如下(以init()为例): +```c++ +#ifdef USE_TNEW_PLACE +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); +template <> +Status ActivationHelper::Init(OpContext &ctx,\ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式 + return Status::OK(); +} 
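+// Note: even with a specialized Init(), the helper is still registered exactly as in the
+// generic version shown above: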
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); +#endif +``` + +在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册 + +```c++ +#ifdef USE_TNEW_PLACE +.__alias__("activation") +#endif +``` + +## 注意事项 +不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst new file mode 100644 index 0000000000000000000000000000000000000000..e4682ccb94e6fc60e184632dff9ee16a6bf16ec0 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst @@ -0,0 +1,26 @@ +Anakin - 服务器端加速引擎 +####################### + + +使用文档 +~~~~~~~ + +.. toctree:: + :maxdepth: 1 + + install_anakin.md + convert_paddle_to_anakin.md + run_anakin_on_arm.md + anakin_tutorial.md + anakin_example.md + anakin_gpu_benchmark.md + anakin_arm_benchmark.md + +开发文档 +~~~~~~~ + +.. toctree:: + :maxdepth: 1 + + how_to_add_anakin_op.md + how_to_support_new_device_in_anakin.md diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst new file mode 100644 index 0000000000000000000000000000000000000000..47df6392c123d520c701089db6ee1ae72e4f8ea5 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst @@ -0,0 +1,9 @@ +移动端部署 +########## + +.. toctree:: + :maxdepth: 2 + + mobile_build.md + mobile_dev.md + diff --git a/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md new file mode 100644 index 0000000000000000000000000000000000000000..bb7c1950308622e3de292268a718e6ec688e6ae6 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md @@ -0,0 +1,69 @@ +## 从源码编译安装Anakin ## + +我们已经在CentOS 7.3上成功的安装和测试了Anakin,对于其他操作系统,我们将很快支持。 + +### 安装概览 ### + +* [在CentOS上安装 Anakin]() +* [在Ubuntu上安装 Anakin]() +* [在ARM上安装 Anakin](run_on_arm_ch.md) +* [验证安装]() + + +### 在CentOS上安装 Anakin ### +#### 1. 系统要求 #### + +* make 3.82+ +* cmake 2.8.12+ +* gcc 4.8.2+ +* g++ 4.8.2+ +* 其他需要补充的。。。 + +#### 2. 编译CPU版Anakin #### + +暂时不支持 + +#### 3. 编译支持NVIDIA GPU的Anakin #### + +- 3.1. 安装依赖 + - 3.1.1 protobuf + >$ git clone https://github.com/google/protobuf + >$ cd protobuf + >$ git submodule update --init --recursive + >$ ./autogen.sh + >$ ./configure --prefix=/path/to/your/insall_dir + >$ make + >$ make check + >$ make install + >$ sudo ldconfig + + + 如安装protobuf遇到任何问题,请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md) + +- 3.2 CUDA Toolkit + - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher. 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). + - [cuDNN v7](https://developer.nvidia.com/cudnn). 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). +- 3.3 编译Anakin + >$ git clone https:/xxxxx + >$ cd anakin + >$ mkdir build + >$ camke .. + >$ make + + +#### 4. 编译支持AMD GPU的Anakin #### + +暂时还不支持 + + +### 在Ubuntu上安装 Anakin ### + +暂时还不支持 + + +### 在ARM上安装 Anakin ### + +暂时还不支持 + +### 验证安装 ### +we are coming soon... diff --git a/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md b/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md new file mode 100644 index 0000000000000000000000000000000000000000..e51593164987d548e256ddebbc5fa8d960fb5255 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md @@ -0,0 +1,59 @@ +# 环境搭建 +## 使用 docker +### 1. 
安装 docker +安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/) +### 2. 使用 docker 搭建构建环境 +首先进入 paddle-mobile 的目录下,执行 `docker build` +以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行) +``` +$ docker build -t paddle-mobile:dev - < Dockerfile +``` +使用 `docker images` 可以看到我们新建的 image +``` +$ docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +paddle-mobile dev 33b146787711 45 hours ago 372MB +``` +### 3. 使用 docker 构建 +进入 paddle-mobile 目录,执行 docker run +``` +$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev +root@5affd29d4fc5:/ # cd /paddle-mobile +# 生成构建 android 产出的 Makefile +root@5affd29d4fc5:/ # rm CMakeCache.txt +root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake +# 生成构建 linux 产出的 Makefile +root@5affd29d4fc5:/ # rm CMakeCache.txt +root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake +``` +### 4. 设置编译选项 +可以通过 ccmake 设置编译选项 +``` +root@5affd29d4fc5:/ # ccmake . + Page 1 of 1 + CMAKE_ASM_FLAGS + CMAKE_ASM_FLAGS_DEBUG + CMAKE_ASM_FLAGS_RELEASE + CMAKE_BUILD_TYPE + CMAKE_INSTALL_PREFIX /usr/local + CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake + CPU ON + DEBUGING ON + FPGA OFF + LOG_PROFILE ON + MALI_GPU OFF + NET googlenet + USE_EXCEPTION ON + USE_OPENMP OFF +``` +修改选项后,按 `c`, `g` 更新 Makefile +### 5. 构建 +使用 make 命令进行构建 +``` +root@5affd29d4fc5:/ # make +``` +### 6. 查看构建产出 +构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行 + +## 不使用 docker +不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md b/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md new file mode 100644 index 0000000000000000000000000000000000000000..474380f9dbfd2fb8a06630cb1ca3ca5cd14ca9d9 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md @@ -0,0 +1,72 @@ +# iOS开发文档 + +## 编译 + +### 一. 使用 build.sh 编译 + +```sh +sh build.sh ios + +# 如果只想编译某个特定模型的 op, 则需执行以下命令 +sh build.sh ios googlenet + +# 在这个文件夹下, 你可以拿到生成的 .a 库 +cd ../build/release/ios/build + +``` + +### 二. 使用 xcode 编译 + +我们提供了 ios 开发更为熟悉的 xcode 编译环境: +在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo + +### 三. 
集成 + +#### 如使用 c++ 接口 +将 + +``` +libpaddle-mobile.a +io.h +program.h +types.h +lod_tensor.h +tensor.h +``` +拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释 + +#### 如使用 oc 接口 +将在xcode 编译生成的 +``` +libPaddleMobile.a +PaddleMobile.h +``` +拖入工程, 接口如下: + +``` +/* + 创建单例对象 +*/ ++ (instancetype)sharedInstance; + +/* + load 模型, 开辟内存 +*/ +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; + +/* + 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict +*/ +- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale; + +/* + 进行预测 +*/ +- (NSArray *)predict:(CGImageRef)image; + +/* + 清理内存 +*/ +- (void)clear; + +``` diff --git a/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md b/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md new file mode 100644 index 0000000000000000000000000000000000000000..ebeb38f534ebfc8cb5a41d103abe3bb1de7e379a --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md @@ -0,0 +1,151 @@ +## 源码编译 Anakin ## + +目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在mac os和centos上编译和测试通过。 + +### 安装概览 ### + +* [系统需求](#0001) +* [安装第三方依赖](#0002) +* [Anakin源码编译](#0003) +* [验证安装](#0004) + + +### 1. 系统需求 ### + +* 宿主机: linux, mac +* cmake 3.8.2+ +* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip) + +### 2. 安装第三方依赖 ### + +- 2.1 protobuf3.4.0 + 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0) + - 2.1.1 为宿主机编译protobuf + ```bash + $ tar -xzf protobuf-3.4.0.tar.gz + $ cd protobuf-3.4.0 + $ ./autogen.sh + $ ./configure + $ make + $ make check + $ make install + ``` + 上述 $make install 执行后,可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下, + 如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。 + 然后将已经生成文件清除。 + ```bash + $ make distclean + ``` + - 2.1.1 交叉编译Android`armeabi-v7a`的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值, + ```bash + + $ export ANDROID_NDK=your_ndk_path + $ ARCH_ABI="arm-linux-androideabi-4.9" + $ HOSTOSN="darwin-x86_64" + $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm + $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI + $ export LDFLAGS="--sysroot=$SYSROOT" + $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS" + $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a" + $ export CPPFLAGS="" + $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/" + $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT" + $ export CCFLAGS="$CXXFLAGS" + $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS" + $ export CC="$CXX" + $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib" + $ ./autogen.sh + $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD" + $ make + ``` + + 编译生成 *.a 静态库,若希望编译*.so 动态链接库 ,请在./configure参数中改--disable-shared为--disable-static --enable-shared。 + 生成文件在src/.libs/下,将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。 + 在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。 + 
```cmake + set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") + ``` + +- 2.2 opencv 2.4.3+(optional) + Anakin只在examples示例中使用opencv + Android系统的opencv从[这里下载](https://opencv.org/releases.html) + 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a` + 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, + 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。 + ```cmake + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + ``` +### 3. Anakin源码编译 ### + +#### 编译Android版本 + + 克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm) +```bash + cd your_dir + git clone https://github.com/PaddlePaddle/Anakin.git + cd Anakin + git fetch origin arm + git checkout arm + ``` + 修改`android_build.sh` +- 修改NDK路径 + ```bash + #modify "your_ndk_path" to your NDK path + export ANDROID_NDK=your_ndk_path + ``` +- 修改ARM 处理器架构 + 对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`, + 对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。 + 目前我们只支持 `armeabi-v7a with NEON`;`arm64-v8a` 还在开发中。 + ```bash + -DANDROID_ABI="armeabi-v7a with NEON" + ``` +- 设置Android API + 根据Android系统的版本设置API level, 例如API Level 21 -> Android 5.0.1 + ```bash + -DANDROID_NATIVE_API_LEVEL=21 + ``` + +- 选择编译静态库或动态库 + 设置`BUILD_SHARED=NO`编译静态库 + 设置`BUILD_SHARED=YES`编译动态库 + ```bash + -DBUILD_SHARED=NO + ``` +- OpenMP多线程支持 + 设置`USE_OPENMP=YES`开启OpenMP多线程 + ```bash + -DUSE_OPENMP=YES + ``` + +- 编译单测文件 + 设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件 + ```bash + -DBUILD_WITH_UNIT_TEST=YES + ``` + +- 编译示例文件 + 设置`BUILD_EXAMPLES=YES`将会编译示例文件 + ```bash + -DBUILD_EXAMPLES=YES + ``` + +- 开启opencv + 如果使用opencv,设置`USE_OPENCV=YES` + ```bash + -DUSE_OPENCV=YES + ``` + +- 开始编译 + 运行脚本 `android_build.sh` 将自动编译Anakin + ```bash + ./android_build.sh + ``` + +### 4. 验证安装 ### + 编译好的库会放在目录`${Anakin_root}/output`下; + 编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下; + 编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。 + + 对于Android系统,打开设备的调试模式,通过ADB可以访问的目录是`data/local/tmp`,通过ADB push将测试文件、模型和数据发送到设备目录, 运行测试文件。 diff --git a/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md new file mode 120000 index 0000000000000000000000000000000000000000..1126df7a829ab6d98e58a44e8f9c6459feae9a8b --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md @@ -0,0 +1 @@ +../../../dev/contribute_to_paddle_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..1381a3b05f6761c60742eb9365708d94ad8a2642 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md @@ -0,0 +1 @@ +../../../howto/optimization/cpu_profiling_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst b/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..f2396716bddd4810fa77c738d41f5482aa6d6055 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst @@ -0,0 +1,242 @@ +============ +GPU性能调优 +============ + +.. contents:: + +此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。 + +- 什么是性能分析? +- 为什么需要性能分析? +- 如何进行性能分析? 
+- 性能分析工具介绍 +- 详细教程 +- 性能分析小技巧 + +什么是性能分析? +================ +在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度, +也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。 + +简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间? + +为什么需要性能分析? +============================ +训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。 +而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。 +如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦! + +如何进行性能分析? +======================== +为了达到性能最优,您可以采用下面五个步骤: + +- 对代码进行性能分析 +- 找到运行慢的部分 +- 找到运行慢的原因 +- 修改成更快的版本 +- 再次对代码进行性能分析 + +Usually, processor has two key performance limits include float point throughput and +memory throughput. For GPU, it also need more parallelism to fulfill its potential. +This is why they can be so fast. + +通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。 +GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。 + +性能分析工具介绍 +====================== +就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。 + +**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 +在这个教程中,我们主要会介绍nvprof和nvvp。 + +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate +above profilers. + +:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 + +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :linenos: + +上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。 + +1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。 + +2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid +program crashes when CPU version of PaddlePaddle invokes them. + +3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。 + +您会在接下来的部分中获得更多的细节介绍。 + +详细教程 +============ + +内置定时器 +------------ + +如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。 +接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。 +下面举个简单的例子: + +1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 8-12,14 + :linenos: + +2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_TIMER=ON + make + +3. 执行您的代码,并观察结果(如高亮部分)。 + + .. code-block:: bash + :emphasize-lines: 1,12-15 + + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. + [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. 
+ [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd + I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) + [ PASSED ] 1 test. + +nvprof 工具 +---------------- + +要使用命令行分析工具 **nvprof**,您按如下步骤操作即可: + +1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 6-7 + :linenos: + +2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_PROFILER=ON + make + +3. 使用 **nvprof** 来分析执行文件。 + + .. code-block:: bash + + nvprof ./paddle/legacy/math/tests/test_GpuProfiler + +然后,您就能获得如下的分析结果: + +.. code-block:: bash + + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us 
cuDriverGetVersion + 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit + 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion + + +nvvp 工具 +-------------- + +如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。 + +**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启) + +.. image:: nvvp1.png + :align: center + :scale: 33% + +从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。 +同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。 + + +.. image:: nvvp2.png + :align: center + :scale: 33% + +而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。 +例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。 + +.. image:: nvvp3.png + :align: center + :scale: 33% + +.. image:: nvvp4.png + :align: center + :scale: 33% + +性能分析小技巧 +================== + +- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。 +- 接下来可以考虑下时间线的分析。 +- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。 +- 可能的情况下,试着让输出的分析数据和理论值对应。 + + 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。 + 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。 +- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了…… + +性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果! +当然,具体情况因人而异。 + +参考资料 +=========== +Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..904968ba4a8d6cc6489c91a0a751e0a33dcc873c --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md @@ -0,0 +1 @@ +../../../howto/optimization/host_memory_profiling_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/new_op.md b/doc/fluid/new_docs/advanced_usage/development/new_op.md new file mode 120000 index 0000000000000000000000000000000000000000..dce0348585b8c484c1418a03a5fde5d78b0afcc9 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/new_op.md @@ -0,0 +1 @@ +../../../dev/new_op_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp1.png b/doc/fluid/new_docs/advanced_usage/development/nvvp1.png new file mode 100644 index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77 Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp1.png differ diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp2.png b/doc/fluid/new_docs/advanced_usage/development/nvvp2.png new file mode 100644 index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29 Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp2.png differ diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp3.png b/doc/fluid/new_docs/advanced_usage/development/nvvp3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp3.png differ diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp4.png b/doc/fluid/new_docs/advanced_usage/development/nvvp4.png new file mode 100644 index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01 Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp4.png differ diff --git a/doc/fluid/new_docs/advanced_usage/development/pprof_1.png b/doc/fluid/new_docs/advanced_usage/development/pprof_1.png new file mode 100644 index 
0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/pprof_1.png differ diff --git a/doc/fluid/new_docs/advanced_usage/development/pprof_2.png b/doc/fluid/new_docs/advanced_usage/development/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/pprof_2.png differ diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg b/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385 Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg differ diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..a05540e82a7fa795dcd8e7306261ef9bef57426f --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md @@ -0,0 +1 @@ +../../../howto/optimization/timeline_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg b/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5 Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg differ diff --git a/doc/fluid/new_docs/advanced_usage/development/write_docs.rst b/doc/fluid/new_docs/advanced_usage/development/write_docs.rst new file mode 120000 index 0000000000000000000000000000000000000000..dc536c8bdd4924758d4418bac8e4181ffbb1f780 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/development/write_docs.rst @@ -0,0 +1 @@ +../../../dev/write_docs_cn.rst \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/index.rst b/doc/fluid/new_docs/advanced_usage/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..89166573eebca045e948046c69f3b7a3e0031d58 --- /dev/null +++ b/doc/fluid/new_docs/advanced_usage/index.rst @@ -0,0 +1,22 @@ +######## +进阶使用 +######## + + +.. todo:: + + Complete this guide + +.. 
toctree:: + :maxdepth: 2 + + deploy/index_anakin.rst + deploy/index_mobile.rst + development/contribute_to_paddle.md + development/write_docs.rst + development/new_op.md + development/cpu_profiling_cn.md + development/gpu_profiling_cn.rst + development/host_memory_profiling_cn.md + development/timeline_cn.md + benchmark.rst diff --git a/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png b/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png new file mode 100644 index 0000000000000000000000000000000000000000..52d4992a22397119af949aa7c11a9ea6365c167c Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..dc7c62b06287ad333dd41082e566b0553d3a5341 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore @@ -0,0 +1,8 @@ +*.pyc +train.log +output +data/cifar-10-batches-py/ +data/cifar-10-python.tar.gz +data/*.txt +data/*.list +data/mean.meta diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..4f20843596aa676962a36241f59560ec2a41257b --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md @@ -0,0 +1,576 @@ + +# 图像分类 + +本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。 + +## 背景介绍 + +图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 + +图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 + + +一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。 + +而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。 + +本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。 + +## 效果展示 + +图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。 + +

+图1. 通用图像分类展示

+ + +图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。 + + +

+图2. 细粒度图像分类展示

+ + +一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。 + +

+图3. 扰动图片展示[22]

+ +## 模型概览 + +图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。 + +在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。 + + 1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Bianray Pattern, 局部二值模式) \[[3](#参考文献)\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。 + + 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 + + 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 + + 4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 + +这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 + +Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 + +

+图4. ILSVRC图像分类Top-5错误率

+ +### CNN + +传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。 + +

+图5. CNN网络示例[20]

+ +- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 +- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 +- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。 +- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。 +- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。 + +另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。 + +接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。 + +### VGG + +牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。 + +

+图6. 基于ImageNet的VGG16模型

+ +### GoogleNet + +GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC的获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。 + +NIN模型主要有两个特点: + +1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。 + +2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。 + +Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。 + +

+图7. Inception模块
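为便于对照上图理解,下面给出一个用 Fluid 基本算子拼出 Inception 模块(对应图(b)的降维版本)的示意草图。它并非 GoogleNet 的官方实现,各分支的卷积核个数 `nf1`、`nf3r`、`nf3`、`nf5r`、`nf5`、`proj` 都只是假设的示例参数:

```python
import paddle.fluid as fluid

def inception_module(x, nf1, nf3r, nf3, nf5r, nf5, proj):
    # 1x1 卷积分支
    b1 = fluid.layers.conv2d(input=x, num_filters=nf1, filter_size=1, act='relu')
    # 1x1 降维 + 3x3 卷积分支
    b2 = fluid.layers.conv2d(input=x, num_filters=nf3r, filter_size=1, act='relu')
    b2 = fluid.layers.conv2d(input=b2, num_filters=nf3, filter_size=3, padding=1, act='relu')
    # 1x1 降维 + 5x5 卷积分支
    b3 = fluid.layers.conv2d(input=x, num_filters=nf5r, filter_size=1, act='relu')
    b3 = fluid.layers.conv2d(input=b3, num_filters=nf5, filter_size=5, padding=2, act='relu')
    # 3x3 最大池化 + 1x1 卷积分支
    b4 = fluid.layers.pool2d(input=x, pool_size=3, pool_stride=1, pool_padding=1, pool_type='max')
    b4 = fluid.layers.conv2d(input=b4, num_filters=proj, filter_size=1, act='relu')
    # 四个分支沿通道维拼接,得到本模块的输出
    return fluid.layers.concat(input=[b1, b2, b3, b4], axis=1)
```

可以看到,1x1 卷积先把通道数压下来,再送入 3x3/5x5 卷积,从而控制了拼接后特征的通道规模,这正是图(b)相对图(a)的改进之处。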

+ +GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这个三个分类器的损失加权求和。 + +GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。 + +

+图8. GoogleNet[12]
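上文提到的“均值池化层后接一层到类别数映射的全连接层”这一尾部结构,可以用下面的示意草图表达(仅为说明用的草图,`features` 表示前面 Inception 模块堆叠得到的特征图,`class_dim` 为假设的类别数):

```python
import paddle.fluid as fluid

def classifier_tail(features, class_dim=1000):
    # 全局均值池化:每个通道的特征图被汇聚为一个标量
    pool = fluid.layers.pool2d(input=features, pool_type='avg', global_pooling=True)
    # 再映射到类别维度,softmax 得到各类别概率
    return fluid.layers.fc(input=pool, size=class_dim, act='softmax')
```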

+ + +上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层;GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,介于篇幅,这里不再详细介绍v2到v4的结构。 + + +### ResNet + +ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。 + +残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。 + +

+图9. 残差模块
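后文的 CIFAR10 示例代码只实现了左侧的基础模块(`basicblock`);作为参考,下面给出右侧瓶颈模块的一种示意写法。通道数 256→64→64→256 取自图中示例,`conv_bn` 是此处临时定义的“卷积 + BN”辅助函数,并非正文代码的一部分:

```python
import paddle.fluid as fluid

def conv_bn(x, num_filters, filter_size, stride=1, padding=0, act='relu'):
    # 卷积(不带偏置)后接 BN,激活放在 BN 之后
    conv = fluid.layers.conv2d(
        input=x, num_filters=num_filters, filter_size=filter_size,
        stride=stride, padding=padding, act=None, bias_attr=False)
    return fluid.layers.batch_norm(input=conv, act=act)

def bottleneck_block(x, in_ch=256, mid_ch=64):
    # 1x1 降维 -> 3x3 卷积 -> 1x1 升维
    y = conv_bn(x, mid_ch, 1)
    y = conv_bn(y, mid_ch, 3, padding=1)
    y = conv_bn(y, in_ch, 1, act=None)
    # 残差路径与“直连”路径相加,再做 ReLU
    return fluid.layers.elementwise_add(x=x, y=y, act='relu')
```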

+ +图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。 + +

+图10. 基于ImageNet的ResNet模型

+ + +## 数据准备 + +通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。 + +由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10]()数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。 + +

+图11. CIFAR10数据集[21]
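在运行训练脚本之前,也可以先用下面的示意片段直观地看一眼数据:它使用正文随后介绍的 `paddle.dataset.cifar` 接口逐条读取训练样本(这里假设每条样本是展平成 3072 维的图像数组加一个 0~9 的类别编号):

```python
from __future__ import print_function
import paddle

# 逐条读取 CIFAR10 训练样本,打印前 3 条的形状与标签
reader = paddle.dataset.cifar.train10()
for i, (image, label) in enumerate(reader()):
    print(image.shape, label)  # 预期形如 (3072,) 与 0~9 的整数
    if i >= 2:
        break
```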

+ +Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。 + +通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。 + +### 模型结构 + +#### Paddle 初始化 + +让我们从导入 Paddle Fluid API 和辅助模块开始。 + +```python +import paddle +import paddle.fluid as fluid +import numpy +import sys +from __future__ import print_function +``` + +本教程中我们提供了VGG和ResNet两个模型的配置。 + +#### VGG + +首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。 +VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: + +```python +def vgg_bn_drop(input): + def conv_block(ipt, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=ipt, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + predict = fluid.layers.fc(input=fc2, size=10, act='softmax') + return predict +``` + + +1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。 + +2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 + +3. 最后接两层512维的全连接。 + +4. 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 + +### ResNet + +ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。 + +先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。 + + - `conv_bn_layer` : 带BN的卷积层。 + - `shortcut` : 残差模块的"直连"路径,"直连"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。 + - `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条"直连"路径组成。 + - `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。 + - `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。 + +```python +def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + +def shortcut(input, ch_in, ch_out, stride): + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) + short = shortcut(input, ch_in, ch_out, stride) + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') + + +def layer_warp(block_func, input, ch_in, ch_out, count, stride): + tmp = block_func(input, ch_in, ch_out, stride) + for i in range(1, count): + tmp = block_func(tmp, ch_out, ch_out, 1) + return tmp +``` + +`resnet_cifar10` 的连接结构主要有以下几个过程。 + +1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 + +2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 + +3. 
最后对网络做均值池化并返回该层。 + +注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 + +```python +def resnet_cifar10(ipt, depth=32): + # depth should be one of 20, 32, 44, 56, 110, 1202 + assert (depth - 2) % 6 == 0 + n = (depth - 2) / 6 + nStages = {16, 64, 128} + conv1 = conv_bn_layer(ipt, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + predict = fluid.layers.fc(input=pool, size=10, act='softmax') + return predict +``` + +## Infererence Program 配置 + +网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32)。 + +```python +def inference_program(): + # The image is 32 * 32 with RGB representation. + data_shape = [3, 32, 32] + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + + predict = resnet_cifar10(images, 32) + # predict = vgg_bn_drop(images) # un-comment to use vgg net + return predict +``` + +## Train Program 配置 + +然后我们需要设置训练程序 `train_program`。它首先从推理程序中进行预测。 +在训练期间,它将从预测中计算 `avg_cost`。 +在有监督训练中需要输入图像对应的类别信息,同样通过`fluid.layers.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 + +**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 + +```python +def train_program(): + predict = inference_program() + + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=predict, label=label) + return [avg_cost, accuracy] +``` + +## Optimizer Function 配置 + +在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 + +```python +def optimizer_program(): + return fluid.optimizer.Adam(learning_rate=0.001) +``` + +## 训练模型 + +### Trainer 配置 + +现在,我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer_func`。 + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +trainer = fluid.Trainer( + train_func=train_program, + optimizer_func=optimizer_program, + place=place) +``` + +### Data Feeders 配置 + +`cifar.train10()` 每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。 + +```python +# Each batch will yield 128 images +BATCH_SIZE = 128 + +# Reader for training +train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.cifar.train10(), buf_size=50000), + batch_size=BATCH_SIZE) + +# Reader for testing. A separated data set for testing. +test_reader = paddle.batch( + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) +``` + +### Event Handler + +可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。 + +`event_handler_plot`可以用来利用回调数据来打点画图: + +

+图12. 训练结果

+ + +```python +params_dirname = "image_classification_resnet.inference.model" + +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 +def event_handler_plot(event): + global step + if isinstance(event, fluid.EndStepEvent): + if step % 1 == 0: + cost_ploter.append(train_title, step, event.metrics[0]) + cost_ploter.plot() + step += 1 + if isinstance(event, fluid.EndEpochEvent): + avg_cost, accuracy = trainer.test( + reader=test_reader, + feed_order=['pixel', 'label']) + cost_ploter.append(test_title, step, avg_cost) + + # save parameters + if params_dirname is not None: + trainer.save_params(params_dirname) +``` + +`event_handler` 用来在训练过程中输出文本日志 + +```python +params_dirname = "image_classification_resnet.inference.model" + +# event handler to track training and testing process +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 100 == 0: + print("\nPass %d, Batch %d, Cost %f, Acc %f" % + (event.step, event.epoch, event.metrics[0], + event.metrics[1])) + else: + sys.stdout.write('.') + sys.stdout.flush() + + if isinstance(event, fluid.EndEpochEvent): + # Test against with the test dataset to get accuracy. + avg_cost, accuracy = trainer.test( + reader=test_reader, feed_order=['pixel', 'label']) + + print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(event.epoch, avg_cost, accuracy)) + + # save parameters + if params_dirname is not None: + trainer.save_params(params_dirname) +``` + +### 训练 + +通过`trainer.train`函数训练: + +**注意:** CPU,每个 Epoch 将花费大约15~20分钟。这部分可能需要一段时间。请随意修改代码,在GPU上运行测试,以提高训练速度。 + +```python +trainer.train( + reader=train_reader, + num_epochs=2, + event_handler=event_handler, + feed_order=['pixel', 'label']) +``` + +一轮训练log示例如下所示,经过1个pass, 训练集上平均 Accuracy 为0.59 ,测试集上平均 Accuracy 为0.6 。 + +```text +Pass 0, Batch 0, Cost 3.869598, Acc 0.164062 +................................................................................................... +Pass 100, Batch 0, Cost 1.481038, Acc 0.460938 +................................................................................................... +Pass 200, Batch 0, Cost 1.340323, Acc 0.523438 +................................................................................................... +Pass 300, Batch 0, Cost 1.223424, Acc 0.593750 +.......................................................................................... +Test with Pass 0, Loss 1.1, Acc 0.6 +``` + +图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 + +

+图13. CIFAR10数据集上VGG模型的分类错误率

+ +## 应用模型 + +可以使用训练好的模型对图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断,可以打开注释,更改加载的模型。 + +### 生成预测输入数据 + +`dog.png` is an example image of a dog. Turn it into an numpy array to match the data feeder format. + +```python +# Prepare testing data. +from PIL import Image +import numpy as np +import os + +def load_image(file): + im = Image.open(file) + im = im.resize((32, 32), Image.ANTIALIAS) + + im = np.array(im).astype(np.float32) + # The storage order of the loaded image is W(width), + # H(height), C(channel). PaddlePaddle requires + # the CHW order, so transpose them. + im = im.transpose((2, 0, 1)) # CHW + im = im / 255.0 + + # Add one dimension to mimic the list format. + im = numpy.expand_dims(im, axis=0) + return im + +cur_dir = os.getcwd() +img = load_image(cur_dir + '/image/dog.png') +``` + +### Inferencer 配置和预测 + +`Inferencer` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 +我们可以简单地插入前面定义的推理程序。 +现在我们准备做预测。 + +```python +inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=params_dirname, place=place) +label_list = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"] +# inference +results = inferencer.infer({'pixel': img}) +print("infer results: %s" % label_list[np.argmax(results[0])]) +``` + +## 总结 + +传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。 + + +## 参考文献 + +[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004. + +[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. + +[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. + +[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. + +[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997. + +[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR. + +[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4). + +[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR. + +[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS. + +[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. 
[Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012. + +[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。 + +[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015) + +[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014. + +[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015. + +[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016. + +[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016). + +[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016). + +[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015. + +[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015. + +[20] http://deeplearning.net/tutorial/lenet.html + +[21] https://www.cs.toronto.edu/~kriz/cifar.html + +[22] http://cs231n.github.io/classification/ + +
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..0fcb008e0a7773e81e5124da09fe07366130b924 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst @@ -0,0 +1,18 @@ +################ +深度学习基础知识 +################ + + +.. todo:: + + 概述 + +.. toctree:: + :maxdepth: 2 + + image_classification/README.cn.md + word2vec/README.cn.md + recommender_system/README.cn.md + understand_sentiment/README.cn.md + label_semantic_roles/README.cn.md + machine_translation/README.cn.md diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..29b5622a53a1b0847e9f53febf1cc50dcf4f044a --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore @@ -0,0 +1,12 @@ +data/train.list +data/test.* +data/conll05st-release.tar.gz +data/conll05st-release +data/predicate_dict +data/label_dict +data/word_dict +data/emb +data/feature +output +predict.res +train.log diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..0891f5b6b16a1b715b44db6c47ba079adfcad4c5 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md @@ -0,0 +1,562 @@ +# 语义角色标注 + +本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/178.html)。 + +## 背景介绍 + +自然语言分析技术大致分为三个层面:词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中,谓词是对主语的陈述或说明,指出“做什么”、“是什么”或“怎么样,代表了一个事件的核心,跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有:施事者(Agent)、受事者(Patient)、客体(Theme)、经验者(Experiencer)、受益者(Beneficiary)、工具(Instrument)、处所(Location)、目标(Goal)和来源(Source)等。 + +请看下面的例子,“遇到” 是谓词(Predicate,通常简写为“Pred”),“小明”是施事者(Agent),“小红”是受事者(Patient),“昨天” 是事件发生的时间(Time),“公园”是事情发生的地点(Location)。 + +$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mbox{Time}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$ + +语义角色标注(Semantic Role Labeling,SRL)以句子的谓词为中心,不对句子所包含的语义信息进行深入分析,只分析句子中各成分与谓词之间的关系,即句子的谓词(Predicate)- 论元(Argument)结构,并用语义角色来描述这些结构关系,是许多自然语言理解任务(如信息抽取,篇章分析,深度问答等)的一个重要中间步骤。在研究中一般都假定谓词是给定的,所要做的就是找出给定谓词的各个论元和它们的语义角色。 + +传统的SRL系统大多建立在句法分析基础之上,通常包括5个流程: + +1. 构建一棵句法分析树,例如,图1是对上面例子进行依存句法分析得到的一棵句法树。 +2. 从句法树上识别出给定谓词的候选论元。 +3. 候选论元剪除;一个句子中的候选论元可能很多,候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。 +4. 论元识别:这个过程是从上一步剪除之后的候选中判断哪些是真正的论元,通常当做一个二分类问题来解决。 +5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。 + +
+图1. 依存句法分析句法树示例
+ +然而,完全句法分析需要确定句子所包含的全部句法信息,并确定句子各成分之间的关系,是一个非常困难的任务,目前技术下的句法分析准确率并不高,句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度,同时获得一定的句法结构信息,“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析(partial parsing)或语块划分(chunking)。和完全句法分析得到一颗完整的句法树不同,浅层句法分析只需要识别句子中某些结构相对简单的独立成分,例如:动词短语,这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难,一些研究\[[1](#参考文献)\]也提出了基于语块(chunk)的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集,我们先来介绍这种表示方法。在BIO表示法中,B代表语块的开始,I代表语块的中间,O代表语块结束。通过B、I、O 三种标记将不同的语块赋予不同的标签,例如:对于一个由角色A拓展得到的语块组,将它所包含的第一个语块赋予标签B-A,将它所包含的其它语块赋予标签I-A,不属于任何论元的语块赋予标签O。 + +我们继续以上面的这句话为例,图1展示了BIO表示方法。 + +
+图2. BIO标注方法示例
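为了更直观地说明 BIO 表示,下面用一个示意性的片段把上文例句(谓词为“遇到”)逐词标注出来。注意其中的角色名与标签写法只是示意,实际任务使用的标签集见后文“数据介绍”一节:

```python
# 示意:每个词对应一个 BIO 标签,B-X 表示角色 X 的语块开始,O 表示不属于任何论元
bio_example = [
    (u"小明", u"B-Agent"),     # 施事者
    (u"昨天", u"B-Time"),      # 时间
    (u"晚上", u"B-Time"),      # 时间
    (u"在",   u"O"),
    (u"公园", u"B-Location"),  # 地点
    (u"遇到", u"B-V"),         # 谓词
    (u"了",   u"O"),
    (u"小红", u"B-Patient"),   # 受事者
    (u"。",   u"O"),
]
```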
+ +从上面的例子可以看到,根据序列标注结果可以直接得到论元的语义角色标注结果,是一个相对简单的过程。这种简单性体现在:(1)依赖浅层句法分析,降低了句法分析的要求和难度;(2)没有了候选论元剪除这一步骤;(3)论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法,简化了流程,降低了错误累积的风险,往往能够取得更好的结果。 + +与基于语块的SRL方法类似,在本教程中我们也将SRL看作一个序列标注问题,不同的是,我们只依赖输入文本序列,不依赖任何额外的语法解析结果或是复杂的人造特征,利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例,实践下面的任务:给定一句话和这句话里的一个谓词,通过序列标注的方式,从句子中找到谓词对应的论元,同时标注它们的语义角色。 + +## 模型概览 + +循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛地应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 + +### 栈式循环神经网络(Stacked Recurrent Neural Network) + +深层网络有助于形成层次化特征,网络上层在下层已经学习到的初级特征基础上,形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络,但由于LSTM各个时间步参数共享,$t-1$时刻状态到$t$时刻的映射,始终只经过了一次非线性映射,也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元,令前一个LSTM$t$时刻的输出,成为下一个LSTM单元$t$时刻的输入,帮助我们构建起一个深层网络,我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力,能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。 + +然而,训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常,堆叠4层LSTM单元可以正常训练,当层数达到4~8层时,会出现性能衰减,这时必须考虑一些新的结构以保证梯度纵向顺畅传播,这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失梯度爆炸” 问题的智慧之一:在记忆单元(Memory Cell)这条信息传播的路线上没有非线性映射,当梯度反向传播时既不会衰减、也不会爆炸。因此,深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。 + +一个LSTM单元完成的运算可以被分为三部分:(1)输入到隐层的映射(input-to-hidden) :每个时间步输入信息$x$会首先经过一个矩阵映射,再作为遗忘门,输入门,记忆单元,输出门的输入,注意,这一次映射没有引入非线性激活;(2)隐层到隐层的映射(hidden-to-hidden):这一步是LSTM计算的主体,包括遗忘门,输入门,记忆单元更新,输出门的计算;(3)隐层到输出的映射(hidden-to-output):通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上,加入一条新的路径:除上一层LSTM输出之外,将前层LSTM的输入到隐层的映射作为的一个新的输入,同时加入一个线性映射去学习一个新的变换。 + +图3是最终得到的栈式循环神经网络结构示意图。 + +

+图3. 基于LSTM的栈式循环神经网络结构示意图
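上文所说的“新的路径”,落实到 Fluid 里就是:把前一层的输入到隐层映射与前一层 LSTM 的输出分别再做一次线性映射后求和,作为下一层 LSTM 单元的输入。下面的示意片段与后文 `db_lstm` 的做法一致(`hidden_dim` 为示例取值):

```python
import paddle.fluid as fluid

def stack_step(input_proj, prev_lstm, hidden_dim=512):
    # 前层的输入到隐层映射与前层 LSTM 输出,分别线性映射后求和
    mix = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_proj, size=hidden_dim, act='tanh'),
        fluid.layers.fc(input=prev_lstm, size=hidden_dim, act='tanh'),
    ])
    # 求和结果作为下一个 LSTM 单元的输入
    lstm = fluid.layers.dynamic_lstm(
        input=mix, size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')
    return mix, lstm
```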

+ +### 双向循环神经网络(Bidirectional Recurrent Neural Network) + +在LSTM中,$t$时刻的隐藏层向量编码了到$t$时刻为止所有输入的信息,但$t$时刻的LSTM可以看到历史,却无法看到未来。在绝大多数自然语言处理任务中,我们几乎总是能拿到整个句子。这种情况下,如果能够像获取历史信息一样,得到未来的信息,对序列学习任务会有很大的帮助。 + +为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 + +

+图4. 基于LSTM的双向循环神经网络结构示意图

+ +需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中,介绍另一种双向循环神经网络。 + +### 条件随机场 (Conditional Random Field) + +使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务。在SRL任务中,深层LSTM网络学习输入的特征表示,条件随机场(Conditional Random Filed, CRF)在特征的基础上完成序列标注,处于整个网络的末端。 + +CRF是一种概率化结构模型,可以看作是一个概率无向图模型,结点表示随机变量,边表示随机变量之间的概率依赖关系。简单来讲,CRF学习条件概率$P(X|Y)$,其中 $X = (x_1, x_2, ... , x_n)$ 是输入序列,$Y = (y_1, y_2, ... , y_n)$ 是标记序列;解码过程是给定 $X$序列求解令$P(Y|X)$最大的$Y$序列,即$Y^* = \mbox{arg max}_{Y} P(Y | X)$。 + +序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 + +

+图5. 序列标注任务中使用的线性链条件随机场

+ +根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\],在给定观测序列$X$时,一个特定标记序列$Y$的概率可以定义为: + +$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ + +其中$Z(X)$是归一化因子,$t_j$ 是定义在边上的特征函数,依赖于当前和前一个位置,称为转移特征,表示对于输入序列$X$及其标注序列在 $i$及$i - 1$位置上标记的转移概率。$s_k$是定义在结点上的特征函数,称为状态特征,依赖于当前位置,表示对于观察序列$X$及其$i$位置的标记概率。$\lambda_j$ 和 $\mu_k$ 分别是转移特征函数和状态特征函数对应的权值。实际上,$t$和$s$可以用相同的数学形式表示,再对转移特征和状态特在各个位置$i$求和有:$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$,把$f$统称为特征函数,于是$P(Y|X)$可表示为: + +$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ + +$\omega$是特征函数对应的权值,是CRF模型要学习的参数。训练时,对于给定的输入序列和对应的标记序列集合$D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$ ,通过正则化的极大似然估计,求解如下优化目标: + +$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ + +这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时,对于给定的输入序列$X$,通过解码算法(通常有:维特比算法、Beam Search)求令出条件概率$\bar{P}(Y|X)$最大的输出序列 $\bar{Y}$。 + +### 深度双向LSTM(DB-LSTM)SRL模型 + +在SRL任务中,输入是 “谓词” 和 “一句话”,目标是从这句话中找到谓词的论元,并标注论元的语义角色。如果一个句子含有$n$个谓词,这个句子会被处理$n$次。一个最为直接的模型是下面这样: + +1. 构造输入; + - 输入1是谓词,输入2是句子 + - 将输入1扩展成和输入2一样长的序列,用one-hot方式表示; +2. one-hot方式的谓词序列和句子序列通过词表,转换为实向量表示的词向量序列; +3. 将步骤2中的2个词向量序列作为双向LSTM的输入,学习输入序列的特征表示; +4. CRF以步骤3中模型学习到的特征为输入,以标记序列为监督信号,实现序列标注; + +大家可以尝试上面这种方法。这里,我们提出一些改进,引入两个简单但对提高系统性能非常有效的特征: + +- 谓词上下文:上面的方法中,只用到了谓词的词向量表达谓词相关的所有信息,这种方法始终是非常弱的,特别是如果谓词在句子中出现多次,有可能引起一定的歧义。从经验出发,谓词前后若干个词的一个小片段,能够提供更丰富的信息,帮助消解歧义。于是,我们把这样的经验也添加到模型中,为每个谓词同时抽取一个“谓词上下文” 片段,也就是从这个谓词前后各取$n$个词构成的一个窗口片段; +- 谓词上下文区域标记:为句子中的每一个词引入一个0-1二值变量,表示它们是否在“谓词上下文”片段中; + +修改后的模型如下(图6是一个深度为4的模型结构示意图): + +1. 构造输入 + - 输入1是句子序列,输入2是谓词序列,输入3是谓词上下文,从句子中抽取这个谓词前后各$n$个词,构成谓词上下文,用one-hot方式表示,输入4是谓词上下文区域标记,标记了句子中每一个词是否在谓词上下文中; + - 将输入2~3均扩展为和输入1一样长的序列; +2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列;其中输入1、3共享同一个词表,输入2和4各自独有词表; +3. 第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特性表示序列; +4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; + +
+图6. SRL任务上的深层双向LSTM模型
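上文修改后模型第 1 步里提到的“谓词上下文”与“谓词上下文区域标记”两个特征,可以用下面的示意函数来理解:从谓词位置向前后各取 n 个词构成窗口,并为句中每个词打上是否落在窗口内的 0-1 标记(窗口大小与越界填充符均为假设值,并非数据预处理脚本的原始实现):

```python
def predicate_context(words, pred_idx, n=2, pad=u"<pad>"):
    # 谓词上下文:谓词前后各 n 个词(越界处用假设的填充符 pad 补齐)
    ctx = [words[i] if 0 <= i < len(words) else pad
           for i in range(pred_idx - n, pred_idx + n + 1)]
    # 谓词上下文区域标记:句中每个词是否落在上述窗口内
    mark = [1 if pred_idx - n <= i <= pred_idx + n else 0
            for i in range(len(words))]
    return ctx, mark
```

例如对上文例句取 n=2 时,谓词“遇到”的上下文窗口为 [“在”, “公园”, “遇到”, “了”, “小红”],对应这 5 个词的区域标记为 1,其余词为 0。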
+ + +## 数据介绍 + +在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是,CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 + +原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中,我们使用test.wsj文件夹中的数据进行训练和测试,并只会用到words文件夹(文本序列)和props文件夹(标注结果)下的数据。本教程使用的数据目录如下: + +```text +conll05st-release/ +└── test.wsj + ├── props # 标注结果 + └── words # 输入文本序列 +``` + +标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同,但原理是相同的,关于标注结果标签含义的说明,请参考论文\[[9](#参考文献)\]。 + +原始数据需要进行数据预处理才能被PaddlePaddle处理,预处理包括下面几个步骤: + +1. 将文本序列和标记序列其合并到一条记录中; +2. 一个句子如果含有$n$个谓词,这个句子会被处理$n$次,变成$n$条独立的训练样本,每个样本一个不同的谓词; +3. 抽取谓词上下文和构造谓词上下文区域标记; +4. 构造以BIO法表示的标记; +5. 依据词典获取词对应的整数索引。 + +预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 + +| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | +|---|---|---|---|---| +| A | set | n't been set . × | 0 | B-A1 | +| record | set | n't been set . × | 0 | I-A1 | +| date | set | n't been set . × | 0 | I-A1 | +| has | set | n't been set . × | 0 | O | +| n't | set | n't been set . × | 1 | B-AM-NEG | +| been | set | n't been set . × | 1 | O | +| set | set | n't been set . × | 1 | B-V | +| . | set | n't been set . × | 1 | O | + + +除数据之外,我们同时提供了以下资源: + +| 文件名称 | 说明 | +|---|---| +| word_dict | 输入句子的词典,共计44068个词 | +| label_dict | 标记的词典,共计106个标记 | +| predicate_dict | 谓词的词典,共计3162个词 | +| emb | 一个训练好的词表,32维 | + +我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 + +获取词典,打印词典大小: + +```python +from __future__ import print_function + +import math, os +import numpy as np +import paddle +import paddle.v2.dataset.conll05 as conll05 +import paddle.fluid as fluid +import time + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_dict_len = len(verb_dict) + +print('word_dict_len: ', word_dict_len) +print('label_dict_len: ', label_dict_len) +print('pred_dict_len: ', pred_dict_len) +``` + +## 模型配置说明 + +- 定义输入数据维度及模型超参数。 + +```python +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 +mix_hidden_lr = 1e-3 + +IS_SPARSE = True +PASS_NUM = 10 +BATCH_SIZE = 10 + +embedding_name = 'emb' +``` + +这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 + +- 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 + +```python +# 这里加载PaddlePaddle上版保存的二进制模型 +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. 
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) +``` + +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 + +```python +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[pred_dict_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + # Since word vector lookup table is pre-trained, we won't update it this time. + # trainable being False prevents updating the lookup table during training. + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + # 8 LSTM units are trained through alternating left-to-right / right-to-left order + # denoted by the variable `reverse`. + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + # In PaddlePaddle, state features and transition features of a CRF are implemented + # by a fully connected layer and a CRF layer seperately. The fully connected layer + # with linear activation learns the state features, here we use fluid.layers.sums + # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: + # fluid.layers.linear_chain_crf only + # learns the transition features, which is a cost layer and is the last layer of the network. + # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence + # as the cost by given the input sequence and it requires the true tag sequence + # as target in the learning process. 
+ + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + # 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射, + # 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征 + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') + ]) + + return feature_out +``` + +## 训练模型 + +- 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 + +- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 + +- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 + +- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 + +- 通过trainer.train函数训练 + +```python +def train(use_cuda, save_dirname=None, is_local=True): + # define network topology + + # 句子序列 + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + + # 谓词 + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + + # 谓词上下文5个特征 + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + + # 谓词上下区域标志 + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + # define network topology + feature_out = db_lstm(**locals()) + + # 标注序列 + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + + # 学习 CRF 的转移特征 + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) + + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + + sgd_optimizer.minimize(avg_cost) + + # The CRF decoding layer is used for evaluation and inference. + # It shares weights with CRF layer. The sharing of parameters among multiple layers + # is specified by using the same parameter name in these layers. If true tag sequence + # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error + # for each input token and sums the error over the entire sequence. + # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. 
+ crf_decode = fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) + exe = fluid.Executor(place) + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + embedding_param = fluid.global_scope().find_var( + embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), + place) + + start_time = time.time() + batch_id = 0 + for pass_id in xrange(PASS_NUM): + for data in train_data(): + cost = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + cost = cost[0] + + if batch_id % 10 == 0: + print("avg_cost: " + str(cost)) + if batch_id != 0: + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) + # Set the threshold low to speed up the CI test + if float(cost) < 60.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) + return + + batch_id = batch_id + 1 + + train_loop(fluid.default_main_program()) +``` + + +## 应用模型 + +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 + +```python +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # Setup inputs by creating LoDTensors to represent sequences of words. + # Here each word is the basic element of these LoDTensors and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensors will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. 
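+        # (补充注释)例如:按下面的设定 lod = [[3, 4, 2]]、base_shape = [1],
+        # 随机构造出的每个 LoDTensor 形状均为 [9, 1](9 = 3 + 4 + 2,即三个
+        # “句子”共 9 个词),并带有同样的一层 LoD 信息。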
+ lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + pred = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=pred_dict_len - 1) + ctx_n2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_n1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_0 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_p1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_p2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + mark = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=mark_dict_len - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == 'word_data' + assert feed_target_names[1] == 'verb_data' + assert feed_target_names[2] == 'ctx_n2_data' + assert feed_target_names[3] == 'ctx_n1_data' + assert feed_target_names[4] == 'ctx_0_data' + assert feed_target_names[5] == 'ctx_p1_data' + assert feed_target_names[6] == 'ctx_p2_data' + assert feed_target_names[7] == 'mark_data' + + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) +``` + +整个程序的入口如下: + +```python +def main(use_cuda, is_local=True): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "label_semantic_roles.inference.model" + + train(use_cuda, save_dirname, is_local) + infer(use_cuda, save_dirname) + + +main(use_cuda=False) +``` + +## 总结 + +语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 + +## 参考文献 +1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483. +2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013. +3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. +4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014. +5. Lafferty J, McCallum A, Pereira F. 
[Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289. +6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012. +7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330. +8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106. +9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164. +10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md b/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md new file mode 100644 index 0000000000000000000000000000000000000000..a27499c6ed8d1149c6d519006086febbcae943fa --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md @@ -0,0 +1,54 @@ +# 学习资料 + +## 要读的第一本书 +基础理论习得的最直接来源就是书本。按机器学习理论、深度学习理论、编程语言三方面划分,这里推荐如下书籍辅助您。 + + +### 机器学习理论 + +在开启深度学习之前,您需要先行掌握机器学习的理论。深度学习是机器学习中的一个分支,两者内在的理论基础存在强关联。 +机器学习理论的书籍教材比较多,这里推荐一本易懂易学的书籍,可以重点关注神经网络部分。 + +书名:《机器学习》(周志华著,清华大学出版社,2016年版) + +### 深度学习理论 + +打好机器学习的理论功底后,您可以开始钻研深度学习的理论。通常深度学习理论会给人留下抽象难懂的印象,且和数学结合紧密。 +为了让您能够顺利入门,这里推荐一份易学易用的教材,无论深度学习理论还是数学理论即可一本搞定。 + +书名:《Deep Learning(深度学习)》(Goodfellow, Bengio, Courville合著,赵申剑、黎彧君、符天凡和李凯合译,人民邮电出版社,2017年版) +此书电子版在Github上已经开源,详情可参考此链接 [《深度学习》](https://github.com/exacity/deeplearningbook-chinese) + +### 编程语言 + +Python方向:这里推荐您学习Python,一方面各大主流深度学习框架的主力支撑编程语言均为Python;另一方面,对比其他语言,Python较为简单易学。 +Python的教材种类较多,这里推荐一本实操和理论性都兼顾的教材,只要完成书中52个习题,跑代码然后发现问题解决,就能逐步上手。 + +书名:《“笨办法”学Python》(Zed Shaw著,王巍巍译,人民邮电出版社,2014年11月版) + + +C++方向:C++语言在底层框架中使用较多,您逐步掌握开源框架的基本操作后,在更高阶的框架应用中会用到这个技能点。 +同前面提到的Python一样,学习C++时需要多上手操作。这里推荐迅速上手C++的书籍,不但能够学习功能和结构,还提供了解决方案的示例。 + +书名:《Essential C++》【美】李普曼(Lippman,S.B.)著,侯捷译,电子工业出版社2013年8月版 + + + +## 要看的视频公开课 + +在学习一门新技术的同时,除了看书,如果有老师面对面教授,可以更快更好的学会知识。相比于线下授课,视频公开课能够在省钱省力的同时,达到易学易掌握的效果。 +目前深度学习的课程多是公开免费的,通过学习您可以更轻松的理解深度学习中的抽象理论,并在实操方面不绕弯路。 +综合课程生动性、可操作性、紧凑性、连续性这些特点,这里推荐如下课程,同步附上网址,便于您查找学习。 + +### 理论知识详解视频课 +[机器学习](http://open.163.com/special/opencourse/machinelearning.html) 斯坦福大学教授吴恩达公开课程,包含相关算法的详细讲解。 + +[AI技术](https://ai.baidu.com/paddlepaddle/player?id=13) 百度推出的“AI核心技术掌握”课程,每节课在20-30分钟左右,从AI技术到深度学习进行全面细致的解读。 + +[深度学习](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) 台湾李宏毅教授的在线课程,其中是英文课程,会结合国外的科研成果,但也适合新手入门和理解深度学习。 + +[编程语言](https://ai.baidu.com/paddlepaddle/openCourses) Python操作课程,从基础到进阶操作都提供详细说明,每节课时长20分钟左右。 + +### PaddlePaddle实操视频课 +掌握好理论基础,具备编程能力后,您可以开始使用PaddlePaddle Fluid进行实操,从初阶开始学习,向着中高阶努力。 +目前已有PaddlePaddle官方视频公开课在官网呈现,内含PaddlePaddle实战、PaddlePaddle应用场景和机器学习模型讲解课程,帮助开发者从零开始使用PaddlePaddle,从简单场景逐步过渡到工业级应用。[点击这里](http://ai.baidu.com/paddlepaddle/openCourses)您即可开始视频课的学习之旅。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6129b9e8645010fcb8372d9dc3dbb568dfa80907 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore @@ -0,0 +1,9 @@ +data/wmt14 +data/pre-wmt14 +pretrained/wmt14_model +gen.log +gen_result +train.log +dataprovider_copy_1.py +*.pyc +multi-bleu.perl diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..6e5f77fec8a894c390ced8c93ee344fd8d27370e --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md @@ -0,0 +1,472 @@ +# 机器翻译 + +本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 + +## 背景介绍 + +机器翻译(machine translation, MT)是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言(source language),翻译成的结果语言称为目标语言(target 
language)。机器翻译即实现从源语言到目标语言转换的过程,是自然语言处理的重要研究领域之一。 + +早期机器翻译系统多为基于规则的翻译系统,需要由语言学家编写两种语言之间的转换规则,再将这些规则录入计算机。该方法对语言学家的要求非常高,而且我们几乎无法总结一门语言会用到的所有规则,更何况两种甚至更多的语言。因此,传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。 + +为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 + +近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 +
+图1. 基于神经网络的机器翻译系统
+ +本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。 + +## 效果展示 + +以中英翻译(中文翻译到英文)的模型为例,当模型训练完毕时,如果输入如下已分词的中文句子: +```text +这些 是 希望 的 曙光 和 解脱 的 迹象 . +``` +如果设定显示翻译结果的条数(即[柱搜索算法](#柱搜索算法)的宽度)为3,生成的英语句子如下: +```text +0 -5.36816 These are signs of hope and relief . +1 -6.23177 These are the light of hope and relief . +2 -7.7914 These are the light of hope and the relief of hope . +``` + +- 左起第一列是生成句子的序号;左起第二列是该条句子的得分(从大到小),分值越高越好;左起第三列是生成的英语句子。 + +- 另外有两个特殊标志:``表示句子的结尾,``表示未登录词(unknown word),即未在训练字典中出现的词。 + +## 模型概览 + +本节依次介绍双向循环神经网络(Bi-directional Recurrent Neural Network),NMT模型中典型的编码器-解码器(Encoder-Decoder)框架以及柱搜索(beam search)算法。 + +### 双向循环神经网络 + +我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络,这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列,得到其在每个时刻的特征表示,即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。 + +具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。 + + +
+图2. 按时间步展开的双向循环神经网络
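+
+按照上文对六个权重矩阵的说明,可以把该双向循环神经网络每个时间步的计算大致整理为如下形式(偏置项省略,`$f$`、`$g$`表示激活函数,仅作示意):
+
+$$\overrightarrow{h_t}=f(W_1x_t+W_2\overrightarrow{h_{t-1}})$$
+$$\overleftarrow{h_t}=f(W_3x_t+W_5\overleftarrow{h_{t+1}})$$
+$$y_t=g(W_4\overrightarrow{h_t}+W_6\overleftarrow{h_t})$$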
+ +### 编码器-解码器框架 + +编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 +![encoder_decoder](./image/encoder_decoder.png) +
+图3. 编码器-解码器框架
+ + +#### 编码器 + +编码阶段分为三步: + +1. one-hot vector表示:将源语言句子`$x=\left \{ x_1,x_2,...,x_T \right \}$`的每个词`$x_i$`表示成一个列向量`$w_i\epsilon \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$`。这个向量`$w_i$`的维度与词汇表大小`$\left | V \right |$` 相同,并且只有一个维度上有值1(该位置对应该词在词汇表中的位置),其余全是0。 + +2. 映射到低维语义空间的词向量:one-hot vector表示存在两个问题,1)生成的向量维度往往很大,容易造成维数灾难;2)难以刻画词与词之间的关系(如语义相似性,也就是无法很好地表达语义)。因此,需再one-hot vector映射到低维的语义空间,由一个固定维度的稠密向量(称为词向量)表示。记映射矩阵为`$C\epsilon R^{K\times \left | V \right |}$`,用`$s_i=Cw_i$`表示第`$i$`个词的词向量,`$K$`为向量维度。 + +3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。 + +第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。 +
+图4. 使用双向GRU的编码器
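+
+在进入解码器之前,可以先用一小段NumPy代码验证上面第1、2步的含义:one-hot向量`$w_i$`与映射矩阵`$C$`相乘,等价于直接取出`$C$`中对应的一列,这也是后文`embedding`层用“查表”代替矩阵乘法的原因(下面的词表大小、维度和数据都是随意设定的示例,与教程的实际实现无关):
+
+```python
+import numpy as np
+
+np.random.seed(0)
+V, K = 6, 4                       # 假设词表大小 |V| = 6,词向量维度 K = 4,仅作演示
+C = np.random.rand(K, V)          # 映射矩阵 C,每一列是一个词的词向量
+
+i = 2                             # 第 i 个词
+w_i = np.zeros(V)
+w_i[i] = 1.0                      # one-hot 列向量 w_i
+
+s_i = C.dot(w_i)                  # s_i = C w_i
+print(np.allclose(s_i, C[:, i]))  # True:等价于直接取 C 的第 i 列
+```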
+ +#### 解码器 + +机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: +1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: +$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ +其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 + +2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: +$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ +其中`$W_sz_{i+1}+b_z$`是对每个可能的输出单词进行打分,再用softmax归一化就可以得到第`$i+1$`个词的概率`$p_{i+1}$`。 + +3. 根据`$p_{i+1}$`和`$u_{i+1}$`计算代价。 + +4. 重复步骤1~3,直到目标语言序列中的所有词处理完毕。 + +机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 + + +### 柱搜索算法 + +柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 + +柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。 + +使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是: +1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。 + +2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。 + +3. 根据`$p_{i+1}$`采样出单词`$u_{i+1}$`。 + +4. 重复步骤1~3,直到获得句子结束标记``或超过句子的最大生成长度为止。 + +注意:`$z_{i+1}$`和`$p_{i+1}$`的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的,因此并不能保证得到全局最优解。 + +## 数据介绍 + +本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。 + +### 数据预处理 + +我们的预处理流程包括两步: + +- 将每个源语言到目标语言的平行语料库文件合并为一个文件: + +- 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。 + +- `XXX`中的第`$i$`行内容为`XXX.src`中的第`$i$`行和`XXX.trg`中的第`$i$`行连接,用'\t'分隔。 + +- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词,包括:语料中词频最高的(DICTSIZE - 3)个单词,和3个特殊符号``(序列的开始)、``(序列的结束)和``(未登录词)。 + +### 示例数据 + +因为完整的数据集数据量较大,为了验证训练流程,PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。 + +该数据集有193319条训练数据,6003条测试数据,词典长度为30000。因为数据规模限制,使用该数据集训练出来的模型效果无法保证。 + +## 模型配置说明 + +下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 + +```python +from __future__ import print_function +import contextlib + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor +from functools import partial +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +hidden_dim = 32 +word_dim = 16 +batch_size = 2 +max_length = 8 +topk_size = 50 +beam_size = 2 + +decoder_size = hidden_dim +``` + +然后如下实现编码器框架: + + ```python + def encoder(is_sparse): + src_word_id = pd.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = pd.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = 
pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + ``` + +再实现训练模式下的解码器: + +```python + def train_decoder(context, is_sparse): + trg_language_word = pd.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = pd.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = pd.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + pre_state = rnn.memory(init=context) + current_state = pd.fc(input=[current_word, pre_state], + size=decoder_size, + act='tanh') + + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + rnn.update_memory(pre_state, current_state) + rnn.output(current_score) + + return rnn() +``` + +实现推测模式下的解码器: + +```python +def decode(context, is_sparse): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse) + + # use rnn unit to update rnn + current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], + size=decoder_size, + act='tanh') + current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) + # use score to do beam search + current_score = pd.fc(input=current_state_with_lod, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) + selected_ids, selected_scores = pd.beam_search( + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) + + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + # update the break condition: up to the max length or all candidates of + # source sentences have ended. 
+ length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) + + return translation_ids, translation_scores +``` + +进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 + +```python +def train_program(is_sparse): + context = encoder(is_sparse) + rnn_out = train_decoder(context, is_sparse) + label = pd.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(cost) + return avg_cost + + +def optimizer_func(): + return fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)) +``` + +## 训练模型 + +### 定义训练环境 +定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 定义数据提供器 +下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.wmt.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) +``` + +### 构造训练器(trainer) +训练器需要一个训练程序和一个训练优化函数。 + +```python +is_sparse = False +trainer = fluid.Trainer( + train_func=partial(train_program, is_sparse), + place=place, + optimizer_func=optimizer_func) +``` + +### 提供数据 + +`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`wmt14.train`产生的第一列的数据对应的是`src_word_id`这个特征。 + +```python +feed_order = [ + 'src_word_id', 'target_language_word', 'target_language_next_word' + ] +``` + +### 事件处理器 +回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 + +```python +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 10 == 0: + print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) + + if event.step == 20: + trainer.stop() +``` + +### 开始训练 +最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 + +```python +EPOCH_NUM = 1 + +trainer.train( + reader=train_reader, + num_epochs=EPOCH_NUM, + event_handler=event_handler, + feed_order=feed_order) +``` + +## 应用模型 + +### 定义解码部分 + +使用上面定义的 `encoder` 和 `decoder` 函数来推测翻译后的对应id和分数. + +```python +context = encoder(is_sparse) +translation_ids, translation_scores = decode(context, is_sparse) +``` + +### 定义数据 + +我们先初始化id和分数来生成tensors来作为输入数据。在这个预测例子中,我们用`wmt14.test`数据中的第一个记录来做推测,最后我们用"源字典"和"目标字典"来列印对应的句子结果。 + +```python +init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') +init_scores_data = np.array( + [1. 
for _ in range(batch_size)], dtype='float32') +init_ids_data = init_ids_data.reshape((batch_size, 1)) +init_scores_data = init_scores_data.reshape((batch_size, 1)) +init_lod = [1] * batch_size +init_lod = [init_lod, init_lod] + +init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) +init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + +test_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(dict_size), buf_size=1000), + batch_size=batch_size) + +feed_order = ['src_word_id'] +feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order +] +feeder = fluid.DataFeeder(feed_list, place) + +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +``` + +### 测试 +现在我们可以进行预测了。我们要在`feed_order`提供对应参数,放在`executor`上运行以取得id和分数结果 + +```python +exe = Executor(place) +exe.run(framework.default_startup_program()) + +for data in test_data(): + feed_data = map(lambda x: [x[0]], data) + feed_dict = feeder.feed(feed_data) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + results = exe.run( + framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + + result_ids = np.array(results[0]) + result_scores = np.array(results[1]) + + print("Original sentence:") + print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]])) + print("Translated score and sentence:") + for i in xrange(beam_size): + start_pos = result_ids_lod[1][i] + 1 + end_pos = result_ids_lod[1][i+1] + print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1], + " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]]))) + + break +``` + +## 总结 + +端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中,我们介绍了NMT中典型的“编码器-解码器”框架。由于NMT是一个典型的Seq2Seq(Sequence to Sequence,序列到序列)学习问题,因此,Seq2Seq中的query改写(query rewriting)、摘要、单轮对话等问题都可以用本教程的模型来解决。 + +## 参考文献 + +1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009. +2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734. +3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014. +4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015. +5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore @@ -0,0 +1,2 @@ +.idea +.ipynb_checkpoints diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..4b79e62f74e587fcd939d9f9e911af80992ea6a3 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md @@ -0,0 +1,537 @@ +# 个性化推荐 + +本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/176.html)。 + +## 背景介绍 + +在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。 + +个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。 + +传统的推荐系统方法主要有: + +- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。 +- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。 +- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。 + +其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户 (User-Based)的推荐[[3](#参考文献)] 、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。 + +深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。 + +## 效果展示 + +我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[0,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。 + +``` +Input movie_id: 1962 +Input user_id: 1 +Prediction Score is 4.25 +``` + +## 模型概览 + +本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。 + +### YouTube的深度神经网络推荐系统 + +YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: + +

+图1. YouTube 推荐系统结构
+ +#### 候选生成网络(Candidate Generation Network) + +候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个Youtube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。 + +首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 + +

+图2. 候选生成网络结构
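+
+下面用一段极简的NumPy代码示意预测(推荐)阶段的做法:把用户综合特征与库中所有视频的特征做点积打分,取得分最高的$k$个视频作为候选。其中的维度、视频数和数据均为随机生成的假设值,只为说明流程,并非实际系统的实现:
+
+```python
+import numpy as np
+
+np.random.seed(0)
+num_videos, dim = 1000, 32                    # 假设库中有 1000 个视频、特征维度为 32
+video_vecs = np.random.rand(num_videos, dim)  # 每个视频的特征表示 v_j
+user_vec = np.random.rand(dim)                # 用户综合特征 u(即 MLP 的输出)
+
+scores = video_vecs.dot(user_vec)             # 打分:点积 v_j · u
+k = 100
+candidates = np.argsort(-scores)[:k]          # 取得分最高的 k 个视频作为候选
+print(candidates[:10])
+```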

+
+对于一个用户$U$,预测此刻用户要观看的视频$\omega$为视频$i$的概率公式为:
+
+$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
+
+其中$u$为用户$U$的特征表示,$V$为视频库集合,$v_i$为视频库中第$i$个视频的特征表示。$u$和$v_i$为长度相等的向量,两者点积可以通过全连接层实现。
+
+考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与$u$最近的$k$个视频作为生成的候选。
+
+#### 排序网络(Ranking Network)
+排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到低排序后将分数较高的一些视频返回给用户。
+
+### 融合推荐模型
+本节会使用卷积神经网络(Convolutional Neural Networks)来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。
+
+#### 文本卷积神经网络(CNN)
+
+卷积神经网络经常用来处理具有类似网格拓扑结构(grid-like topology)的数据。例如,图像可以视为二维网格的像素点,自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征,并对其进行组合抽象得到更高级的特征表示。实验表明,卷积神经网络能高效地对图像及文本问题进行建模处理。
+
+卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小节我们以如图3所示的网络进行讲解:
+

+图3. 卷积神经网络文本分类模型
+ +假设待处理句子的长度为$n$,其中第$i$个词的词向量(word embedding)为$x_i\in\mathbb{R}^k$,$k$为维度大小。 + +首先,进行词向量的拼接操作:将每$h$个词拼接起来形成一个大小为$h$的词窗口,记为$x_{i:i+h-1}$,它表示词序列$x_{i},x_{i+1},\ldots,x_{i+h-1}$的拼接,其中,$i$表示词窗口中第一个词在整个句子中的位置,取值范围从$1$到$n-h+1$,$x_{i:i+h-1}\in\mathbb{R}^{hk}$。 + +其次,进行卷积操作:把卷积核(kernel)$w\in\mathbb{R}^{hk}$应用于包含$h$个词的窗口$x_{i:i+h-1}$,得到特征$c_i=f(w\cdot x_{i:i+h-1}+b)$,其中$b\in\mathbb{R}$为偏置项(bias),$f$为非线性激活函数,如$sigmoid$。将卷积核应用于句子中所有的词窗口${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$,产生一个特征图(feature map): + +$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ + +接下来,对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征$\hat c$,它是特征图中所有元素的最大值: + +$$\hat c=max(c)$$ + +#### 模型概览 + +在融合推荐模型的电影推荐系统中: + +1. 首先,使用用户特征和电影特征作为神经网络的输入,其中: + + - 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。 + + - 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。 + +2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。 + +3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。 + +4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。 + +

+图4. 融合推荐模型
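+
+结合上面第4步,可以用几行NumPy代码说明这种打分与损失的计算方式(向量为随机生成的示例,其中把余弦相似度乘以5以对应1~5分的评分区间,这与后文网络配置里的`cos_sim`、`scale`和`square_error_cost`层相对应):
+
+```python
+import numpy as np
+
+np.random.seed(0)
+usr_feat = np.random.rand(200)   # 用户特征(200 维,对应上文全连接层的输出)
+mov_feat = np.random.rand(200)   # 电影特征
+
+cos = usr_feat.dot(mov_feat) / (np.linalg.norm(usr_feat) * np.linalg.norm(mov_feat))
+score = 5.0 * cos                # 余弦相似度映射到评分区间,作为推荐打分
+
+label = 4.0                      # 用户的真实打分
+loss = (score - label) ** 2      # 差异的平方作为回归模型的损失
+print(score, loss)
+```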

+ +## 数据准备 + +### 数据介绍与下载 + +我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。 + +Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens` + + +```python +import paddle +movie_info = paddle.dataset.movielens.movie_info() +print movie_info.values()[0] +``` + + +```python +# Run this block to show dataset's documentation +# help(paddle.dataset.movielens) +``` + +在原始数据中包含电影的特征数据,用户的特征数据,和用户对电影的评分。 + +例如,其中某一个电影特征为: + + +```python +movie_info = paddle.dataset.movielens.movie_info() +print movie_info.values()[0] +``` + + + + +这表示,电影的id是1,标题是《Toy Story》,该电影被分为到三个类别中。这三个类别是动画,儿童,喜剧。 + + +```python +user_info = paddle.dataset.movielens.user_info() +print user_info.values()[0] +``` + + + + +这表示,该用户ID是1,女性,年龄比18岁还年轻。职业ID是10。 + + +其中,年龄使用下列分布 + +* 1: "Under 18" +* 18: "18-24" +* 25: "25-34" +* 35: "35-44" +* 45: "45-49" +* 50: "50-55" +* 56: "56+" + +职业是从下面几种选项里面选则得出: + +* 0: "other" or not specified +* 1: "academic/educator" +* 2: "artist" +* 3: "clerical/admin" +* 4: "college/grad student" +* 5: "customer service" +* 6: "doctor/health care" +* 7: "executive/managerial" +* 8: "farmer" +* 9: "homemaker" +* 10: "K-12 student" +* 11: "lawyer" +* 12: "programmer" +* 13: "retired" +* 14: "sales/marketing" +* 15: "scientist" +* 16: "self-employed" +* 17: "technician/engineer" +* 18: "tradesman/craftsman" +* 19: "unemployed" +* 20: "writer" + +而对于每一条训练/测试数据,均为 <用户特征> + <电影特征> + 评分。 + +例如,我们获得第一条训练数据: + + +```python +train_set_creator = paddle.dataset.movielens.train() +train_sample = next(train_set_creator()) +uid = train_sample[0] +mov_id = train_sample[len(user_info[uid].value())] +print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1]) +``` + + User rates Movie with Score [5.0] + + +即用户1对电影1193的评价为5分。 + +## 模型配置说明 + +下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 + + +```python +from __future__ import print_function +import math +import sys +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.nets as nets + +IS_SPARSE = True +USE_GPU = False +BATCH_SIZE = 256 +``` + +然后为我们的用户特征综合模型定义模型配置 + +```python +def get_usr_combined_features(): + + USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 + + uid = layers.data(name='user_id', shape=[1], dtype='int64') + + usr_emb = layers.embedding( + input=uid, + dtype='float32', + size=[USR_DICT_SIZE, 32], + param_attr='user_table', + is_sparse=IS_SPARSE) + + usr_fc = layers.fc(input=usr_emb, size=32) + + USR_GENDER_DICT_SIZE = 2 + + usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') + + usr_gender_emb = layers.embedding( + input=usr_gender_id, + size=[USR_GENDER_DICT_SIZE, 16], + param_attr='gender_table', + is_sparse=IS_SPARSE) + + usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) + + USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) + usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") + + usr_age_emb = layers.embedding( + input=usr_age_id, + size=[USR_AGE_DICT_SIZE, 16], + is_sparse=IS_SPARSE, + param_attr='age_table') + + usr_age_fc = layers.fc(input=usr_age_emb, size=16) + + USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 + usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") + + usr_job_emb = layers.embedding( + input=usr_job_id, + size=[USR_JOB_DICT_SIZE, 16], + param_attr='job_table', + is_sparse=IS_SPARSE) + + usr_job_fc = 
layers.fc(input=usr_job_emb, size=16) + + concat_embed = layers.concat( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) + + usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + + return usr_combined_features +``` + +如上述代码所示,对于每个用户,我们输入4维特征。其中包括user_id,gender_id,age_id,job_id。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便,我们借鉴NLP中的语言模型,将这几维离散的整数值,变换成embedding取出。分别形成usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb。 + +然后,我们对于所有的用户特征,均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。 + +进而,我们对每一个电影特征做类似的变换,网络配置为: + + +```python +def get_mov_combined_features(): + + MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 + + mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') + + mov_emb = layers.embedding( + input=mov_id, + dtype='float32', + size=[MOV_DICT_SIZE, 32], + param_attr='movie_table', + is_sparse=IS_SPARSE) + + mov_fc = layers.fc(input=mov_emb, size=32) + + CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) + + category_id = layers.data( + name='category_id', shape=[1], dtype='int64', lod_level=1) + + mov_categories_emb = layers.embedding( + input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) + + mov_categories_hidden = layers.sequence_pool( + input=mov_categories_emb, pool_type="sum") + + MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) + + mov_title_id = layers.data( + name='movie_title', shape=[1], dtype='int64', lod_level=1) + + mov_title_emb = layers.embedding( + input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) + + mov_title_conv = nets.sequence_conv_pool( + input=mov_title_emb, + num_filters=32, + filter_size=3, + act="tanh", + pool_type="sum") + + concat_embed = layers.concat( + input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) + + mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + + return mov_combined_features +``` + +电影标题名称(title)是一个序列的整数,整数代表的是这个词在索引序列中的下标。这个序列会被送入 `sequence_conv_pool` 层,这个层会在时间维度上使用卷积和池化。因为如此,所以输出会是固定长度,尽管输入的序列长度各不相同。 + +最后,我们定义一个`inference_program`来使用余弦相似度计算用户特征与电影特征的相似性。 + +```python +def inference_program(): + usr_combined_features = get_usr_combined_features() + mov_combined_features = get_mov_combined_features() + + inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) + scale_infer = layers.scale(x=inference, scale=5.0) + + return scale_infer +``` + +进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 + +```python +def train_program(): + + scale_infer = inference_program() + + label = layers.data(name='score', shape=[1], dtype='float32') + square_cost = layers.square_error_cost(input=scale_infer, label=label) + avg_cost = layers.mean(square_cost) + + return [avg_cost, scale_infer] + + +def optimizer_func(): + return fluid.optimizer.SGD(learning_rate=0.2) +``` + + +## 训练模型 + +### 定义训练环境 +定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 定义数据提供器 +下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.movielens.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=BATCH_SIZE) + +test_reader = paddle.batch( + paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) +``` + +### 构造训练器(trainer) +训练器需要一个训练程序和一个训练优化函数。 + +```python +trainer = fluid.Trainer( + train_func=train_program, place=place, 
optimizer_func=optimizer_func) +``` + +### 提供数据 + +`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`movielens.train`产生的第一列的数据对应的是`user_id`这个特征。 + +```python +feed_order = [ + 'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id', + 'movie_title', 'score' +] +``` + +### 事件处理器 +回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 + +```python +# Specify the directory path to save the parameters +params_dirname = "recommender_system.inference.model" + +from paddle.v2.plot import Ploter +test_title = "Test cost" +plot_cost = Ploter(test_title) + + +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + avg_cost_set = trainer.test( + reader=test_reader, feed_order=feed_order) + + # get avg cost + avg_cost = np.array(avg_cost_set).mean() + + plot_cost.append(test_title, event.step, avg_cost_set[0]) + plot_cost.plot() + + print("avg_cost: %s" % avg_cost) + print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost))) + + if event.step == 20: # Adjust this number for accuracy + trainer.save_params(params_dirname) + trainer.stop() +``` + +### 开始训练 +最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 + +```python +trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) +``` + +## 应用模型 + +### 构建预测器 +传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 + +```python +inferencer = fluid.Inferencer( + inference_program, param_path=params_dirname, place=place) +``` + +### 生成测试用输入数据 +使用 create_lod_tensor(data, lod, place) 的API来生成细节层次的张量。`data`是一个序列,每个元素是一个索引号的序列。`lod`是细节层次的信息,对应于`data`。比如,data = [[10, 2, 3], [2, 3]] 意味着它包含两个序列,长度分别是3和2。于是相应地 lod = [[3, 2]],它表明其包含一层细节信息,意味着 `data` 有两个序列,长度分别是3和2。 + +在这个预测例子中,我们试着预测用户ID为1的用户对于电影'Hunchback of Notre Dame'的评分 + +```python +infer_movie_id = 783 +infer_movie_name = paddle.dataset.movielens.movie_info()[infer_movie_id].title +user_id = fluid.create_lod_tensor([[1]], [[1]], place) +gender_id = fluid.create_lod_tensor([[1]], [[1]], place) +age_id = fluid.create_lod_tensor([[0]], [[1]], place) +job_id = fluid.create_lod_tensor([[10]], [[1]], place) +movie_id = fluid.create_lod_tensor([[783]], [[1]], place) # Hunchback of Notre Dame +category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place) # Animation, Children's, Musical +movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]], + place) # 'hunchback','of','notre','dame','the' +``` + +### 测试 +现在我们可以进行预测了。我们要提供的`feed_order`应该和训练过程一致。 + + +```python +results = inferencer.infer( + { + 'user_id': user_id, + 'gender_id': gender_id, + 'age_id': age_id, + 'job_id': job_id, + 'movie_id': movie_id, + 'category_id': category_id, + 'movie_title': movie_title + }, + return_numpy=False) + +predict_rating = np.array(results[0]) +print("Predict Rating of user id 1 on movie \"" + infer_movie_name + "\" is " + str(predict_rating[0][0])) +print("Actual Rating of user id 1 on movie \"" + infer_movie_name + "\" is 4.") + +``` + +## 总结 + +本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。 + +## 参考文献 + +1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. +2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 
377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2. +3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. +4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001. +5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA +6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). +7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. + + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..667762d327cb160376a4119fa9df9db41b6443b2 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore @@ -0,0 +1,10 @@ +data/aclImdb +data/imdb +data/pre-imdb +data/mosesdecoder-master +*.log +model_output +dataprovider_copy_1.py +model.list +*.pyc +.DS_Store diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..8477cf32146c33947ced447c8bdd287a3e1e71f5 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md @@ -0,0 +1,358 @@ +# 情感分析 + +本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/177.html)。 + +## 背景介绍 + +在自然语言处理中,情感分析一般是指判断一段文本所表达的情绪状态。其中,一段文本可以是一个句子,一个段落或一个文档。情绪状态可以是两类,如(正面,负面),(高兴,悲伤);也可以是三类,如(积极,消极,中性)等等。情感分析的应用场景十分广泛,如把用户在购物网站(亚马逊、天猫、淘宝等)、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论;或为了分析用户对于某一产品的整体使用感受,抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子: + +| 电影评论 | 类别 | +| -------- | ----- | +| 在冯小刚这几年的电影里,算最好的一部的了| 正面 | +| 很不好看,好像一个地方台的电视剧 | 负面 | +| 圆方镜头全程炫技,色调背景美则美矣,但剧情拖沓,口音不伦不类,一直努力却始终无法入戏| 负面| +|剧情四星。但是圆镜视角加上婺源的风景整个非常有中国写意山水画的感觉,看得实在太舒服了。。|正面| + +

+表格 1 电影评论情感分析
+ +在自然语言处理中,情感分析属于典型的**文本分类**问题,即把需要进行情感分析的文本划分为其所属类别。文本分类涉及文本表示和分类方法两个问题。在深度学习的方法出现之前,主流的文本表示方法为词袋模型BOW(bag of words),话题模型等等;分类方法有SVM(support vector machine), LR(logistic regression)等等。 + +对于一段文本,BOW表示会忽略其词顺序、语法和句法,将这段文本仅仅看做是一个词集合,因此BOW方法并不能充分表示文本的语义信息。例如,句子“这部电影糟糕透了”和“一个乏味,空洞,没有内涵的作品”在情感分析中具有很高的语义相似度,但是它们的BOW表示的相似度为0。又如,句子“一个空洞,没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高,但实际上它们的意思很不一样。 + +本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷,它在考虑词顺序的基础上把文本映射到低维度的语义空间,并且以端对端(end to end)的方式进行文本表示及分类,其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。 + +## 模型概览 +本章所使用的文本表示模型为卷积神经网络(Convolutional Neural Networks)和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。 + +### 文本卷积神经网络简介(CNN) + +我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程,这里进行一个简单的回顾。 + +对卷积神经网络来说,首先使用卷积处理输入的词向量序列,产生一个特征图(feature map),对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征,最后,将所有卷积核得到的特征拼接起来即为文本的定长向量表示,对于文本分类问题,将其连接至softmax即构建出完整的模型。在实际应用中,我们会使用多个卷积核来处理句子,窗口大小相同的卷积核堆叠起来形成一个矩阵,这样可以更高效的完成运算。另外,我们也可使用窗口大小不同的卷积核来处理句子,[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核,不同颜色表示不同大小的卷积核操作。 + +对于一般的短文本分类问题,上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示,可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。 + +### 循环神经网络(RNN) + +循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。 + +

+图1. 循环神经网络按时间展开的示意图
+ +循环神经网络按时间展开后如图1所示:在第$t$时刻,网络读入第$t$个输入$x_t$(向量表示)及前一时刻隐层的状态值$h_{t-1}$(向量表示,$h_0$一般初始化为$0$向量),计算得出本时刻隐层的状态值$h_t$,重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为$f$,则其公式可表示为: + +$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$ + +其中$W_{xh}$是输入到隐层的矩阵参数,$W_{hh}$是隐层到隐层的矩阵参数,$b_h$为隐层的偏置向量(bias)参数,$\sigma$为$sigmoid$函数。 + +在处理自然语言时,一般会先将词(one-hot表示)映射为其词向量(word embedding)表示,然后再作为循环神经网络每一时刻的输入$x_t$。此外,可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如,可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层(deep or stacked)循环神经网络,或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。 + +### 长短期记忆网络(LSTM) + +对于较长的序列数据,循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题,Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。 + +相比于简单的循环神经网络,LSTM增加了记忆单元$c$、输入门$i$、遗忘门$f$及输出门$o$。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为$F$,则其公式为: + +$$ h_t=F(x_t,h_{t-1})$$ + +$F$由下列公式组合而成\[[7](#参考文献)\]: +$$ i_t = \sigma{(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)} $$ +$$ f_t = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f) $$ +$$ c_t = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c) $$ +$$ o_t = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o) $$ +$$ h_t = o_t\odot tanh(c_t) $$ +其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示: + +

+图2. 时刻$t$的LSTM [7]
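+
+为了帮助理解,下面用NumPy按上面的公式逐项实现一步LSTM计算。维度和参数都是随机设定的演示值,公式中的各个$W$一律按矩阵乘法处理,仅作示意,与后文`fluid.layers.dynamic_lstm`的实际实现无关:
+
+```python
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+np.random.seed(0)
+x_dim, h_dim = 3, 4    # 输入维度与隐层(记忆单元)维度,仅作演示
+W = {k: np.random.randn(h_dim, x_dim) * 0.1 for k in ('xi', 'xf', 'xc', 'xo')}
+W.update({k: np.random.randn(h_dim, h_dim) * 0.1
+          for k in ('hi', 'hf', 'hc', 'ho', 'ci', 'cf', 'co')})
+b = {k: np.zeros(h_dim) for k in ('i', 'f', 'c', 'o')}
+
+def lstm_step(x_t, h_prev, c_prev):
+    i_t = sigmoid(W['xi'].dot(x_t) + W['hi'].dot(h_prev) + W['ci'].dot(c_prev) + b['i'])
+    f_t = sigmoid(W['xf'].dot(x_t) + W['hf'].dot(h_prev) + W['cf'].dot(c_prev) + b['f'])
+    c_t = f_t * c_prev + i_t * np.tanh(W['xc'].dot(x_t) + W['hc'].dot(h_prev) + b['c'])
+    o_t = sigmoid(W['xo'].dot(x_t) + W['ho'].dot(h_prev) + W['co'].dot(c_t) + b['o'])
+    h_t = o_t * np.tanh(c_t)
+    return h_t, c_t
+
+# 依次读入一个长度为 5 的随机“句子”,不断更新 (h, c)
+h, c = np.zeros(h_dim), np.zeros(h_dim)
+for t in range(5):
+    h, c = lstm_step(np.random.randn(x_dim), h, c)
+print(h)
+```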

+
+LSTM通过给简单的循环神经网络增加记忆及控制门的方式,增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\],其设计更为简洁一些。**这些改进虽然各有不同,但是它们的宏观描述却与简单的循环神经网络一样(如图2所示),即隐状态依据当前输入及前一时刻的隐状态来改变,不断地循环这一过程直至输入处理完毕:**
+
+$$ h_t=Recurrent(x_t,h_{t-1})$$
+
+其中,$Recurrent$可以表示简单的循环神经网络、GRU或LSTM。
+
+### 栈式双向LSTM(Stacked Bidirectional LSTM)
+
+对于正常顺序的循环神经网络,$h_t$包含了$t$时刻之前的输入信息,也就是上文信息。同样,为了得到下文信息,我们可以使用反方向(将输入逆序处理)的循环神经网络。结合构建深层循环神经网络的方法(深层神经网络往往能得到更抽象和高级的特征表示),我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\],来对时序数据进行建模。
+
+如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。
+

+图3. 栈式双向LSTM用于文本分类
+ + +## 数据集介绍 + +我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中,负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分。 +```text +aclImdb +|- test + |-- neg + |-- pos +|- train + |-- neg + |-- pos +``` +Paddle在`dataset/imdb.py`中提实现了imdb数据集的自动下载和读取,并提供了读取字典、训练数据、测试数据等API。 + +## 配置模型 + +在该示例中,我们实现了两种文本分类算法,分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络,以及[栈式双向LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM))。我们首先引入要用到的库和定义全局变量: + +```python +from __future__ import print_function +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +STACKED_NUM = 3 +BATCH_SIZE = 128 +USE_GPU = False +``` + + +### 文本卷积神经网络 +我们构建神经网络`convolution_net`,示例代码如下。 +需要注意的是:`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。 + +```python +def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc( + input=[conv_3, conv_4], size=class_dim, act="softmax") + return prediction +``` + +网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 + + + +### 栈式双向LSTM + +栈式双向神经网络`stacked_lstm_net`的代码片段如下: + +```python +def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) + + inputs = [fc1, lstm1] + + for i in range(2, stacked_num + 1): + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) + inputs = [fc, lstm] + + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') + + prediction = fluid.layers.fc( + input=[fc_last, lstm_last], size=class_dim, act='softmax') + return prediction +``` +以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。`paddle.activation.Softmax`函数用来计算分类属于某个类别的概率。 + +重申一下,此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个。我们以`convolution_net`为例。 + +接下来我们定义预测程序(`inference_program`)。预测程序使用`convolution_net`来对`fluid.layer.data`的输入进行预测。 + +```python +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) + # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM) + return net +``` + +我们这里定义了`training_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。 + +因为是有监督的学习,训练集的标签也在`paddle.layer.data`中定义了。在训练过程中,交叉熵用来在`paddle.layer.classification_cost`中作为损失函数。 + +在测试过程中,分类器会计算各个输出的概率。第一个返回的数值规定为 损耗(cost)。 + +```python +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = 
fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) +``` + +## 训练模型 + +### 定义训练环境 + +定义您的训练是在CPU上还是在GPU上: + + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 定义数据提供器 + +下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.train 每次会在乱序化后提供一个大小为BATCH_SIZE的数据,乱序化的大小为缓存大小buf_size。 + +注意:读取IMDB的数据可能会花费几分钟的时间,请耐心等待。 + +```python +print("Loading IMDB word dict....") +word_dict = paddle.dataset.imdb.word_dict() + +print ("Reading training data....") +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) +``` + +### 构造训练器(trainer) +训练器需要一个训练程序和一个训练优化函数。 + +```python +trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) +``` + +### 提供数据 + +`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`imdb.train`产生的第一列的数据对应的是`words`这个特征。 + +```python +feed_order = ['words', 'label'] +``` + +### 事件处理器 + +回调函数event_handler在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 + +```python +# Specify the directory path to save the parameters +params_dirname = "understand_sentiment_conv.inference.model" + +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + if event.step == 10: + trainer.save_params(params_dirname) + trainer.stop() +``` + +### 开始训练 + +最后,我们传入训练循环数(num_epoch)和一些别的参数,调用 trainer.train 来开始训练。 + +```python +trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) +``` + +## 应用模型 + +### 构建预测器 + +传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 + +```python +inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) +``` + +### 生成测试用输入数据 + +为了进行预测,我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词,则设为`unknown`。 +然后我们用`create_lod_tensor`来创建细节层次的张量。 + +```python +reviews_str = [ + 'read the book forget the movie', 'this is a great movie', 'this is very bad' +] +reviews = [c.split() for c in reviews_str] + +UNK = word_dict[''] +lod = [] +for c in reviews: + lod.append([word_dict.get(words, UNK) for words in c]) + +base_shape = [[len(c) for c in lod]] + +tensor_words = fluid.create_lod_tensor(lod, base_shape, place) +``` + +## 应用模型 + +现在我们可以对每一条评论进行正面或者负面的预测啦。 + +```python +results = inferencer.infer({'words': tensor_words}) + +for i, r in enumerate(results[0]): + print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'") + +``` + + +## 总结 + +本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。 + + +## 参考文献 +1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014. +2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014. +3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016. +4. 
Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449. +5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780. +6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166. +7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013. +8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. +9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a620e0279c310d213d4e6d8e99e666962c11e352 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore @@ -0,0 +1,3 @@ +data/train.list +data/test.list +data/simple-examples* diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..904d99fe2ffc9ead69a86c9763568a5c098348d5 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md @@ -0,0 +1,446 @@ + +# 词向量 + +本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/175.html)。 + +## 背景介绍 + +本章我们介绍词的向量表征,也称为word embedding。词向量是自然语言处理中常见的一个操作,是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。 + +在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。 +在这种方式里,每个词被表示成一个实数向量(one-hot vector),其长度为字典大小,每个维度对应一个字典里的每个词,除了这个词对应维度上的值是1,其他元素都是0。 + +One-hot vector虽然自然,但是用处有限。比如,在互联网广告系统里,如果用户输入的query是“母亲节”,而有一个广告的关键词是“康乃馨”。虽然按照常理,我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨;但是这两个词对应的one-hot vectors之间的距离度量,无论是欧氏距离还是余弦相似度(cosine similarity),由于其向量正交,都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是:每个词本身的信息量都太小。所以,仅仅给定两个词,不足以让我们准确判别它们是否相关。要想精确计算相关性,我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。 + +在机器学习领域里,各种“知识”被各种模型表示,词向量模型(word embedding model)就是其中的一类。通过词向量模型可将一个 one-hot vector映射到一个维度更低的实数向量(embedding vector),如$embedding(母亲节) = [0.3, 4.2, -1.5, ...], embedding(康乃馨) = [0.2, 5.6, -2.3, ...]$。在这个映射到的实数向量表示中,希望两个语义(或用法)上相似的词对应的词向量“更像”,这样如“母亲节”和“康乃馨”的对应词向量的余弦相似度就不再为零了。 + +词向量模型可以是概率模型、共生矩阵(co-occurrence matrix)模型或神经元网络模型。在用神经网络求词向量之前,传统做法是统计一个词语的共生矩阵$X$。$X$是一个$|V| \times |V|$ 大小的矩阵,$X_{ij}$表示在所有语料中,词汇表`V`(vocabulary)中第i个词和第j个词同时出现的词数,$|V|$为词汇表的大小。对$X$做矩阵分解(如奇异值分解,Singular Value Decomposition \[[5](#参考文献)\]),得到的$U$即视为所有词的词向量: + +$$X = USV^T$$ + +但这样的传统做法有很多问题: + +1) 由于很多词没有出现,导致矩阵极其稀疏,因此需要对词频做额外处理来达到好的矩阵分解效果; + +2) 矩阵非常大,维度太高(通常达到$10^6 \times 10^6$的数量级); + +3) 需要手动去掉停用词(如although, a,...),不然这些频繁出现的词也会影响矩阵分解的效果。 + +基于神经网络的模型不需要计算存储一个在全语料上统计的大表,而是通过学习语义信息得到词向量,因此能很好地解决以上问题。在本章里,我们将展示基于神经网络训练词向量的细节,以及如何用PaddlePaddle训练一个词向量模型。 + + +## 效果展示 + +本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。 + +

+图1. 词向量的二维投影
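+
+类似上图的投影可以用 scikit-learn 的 t-SNE 近似复现。下面是一段示意代码(仅为示意,其中 `embeddings`、`words` 为假设的变量名,分别代表训练得到的 [词表大小, 词向量维度] numpy 数组及对应的词列表):
+
+```python
+# 示意代码:将词向量降到二维并画出散点图
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+
+def plot_embeddings(embeddings, words, num=200):
+    # 只取前 num 个词,用 t-SNE 降到二维
+    low_dim = TSNE(n_components=2, init='pca', random_state=0).fit_transform(embeddings[:num])
+    plt.figure(figsize=(10, 10))
+    for i, word in enumerate(words[:num]):
+        x, y = low_dim[i]
+        plt.scatter(x, y)
+        plt.annotate(word, xy=(x, y))
+    plt.show()
+```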
+ +另一方面,我们知道两个向量的余弦值在$[-1,1]$的区间内:两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0,两个方向完全相反的向量余弦值为-1,即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度: + +``` + +please input two words: big huge +similarity: 0.899180685161 + +please input two words: from company +similarity: -0.0997506977351 + +``` + +以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。 + + +## 模型概览 + +在这里我们介绍三个训练词向量的模型:N-gram模型,CBOW模型和Skip-gram模型,它们的中心思想都是通过上下文得到一个词出现的概率。对于N-gram模型,我们会先介绍语言模型的概念,并在之后的[训练模型](#训练模型)中,带大家用PaddlePaddle实现它。而后两个模型,是近年来最有名的神经元词向量模型,由 Tomas Mikolov 在Google 研发\[[3](#参考文献)\],虽然它们很浅很简单,但训练效果很好。 + +### 语言模型 + +在介绍词向量模型之前,我们先来引入一个概念:语言模型。 +语言模型旨在为语句的联合概率函数$P(w_1, ..., w_T)$建模, 其中$w_i$表示句子中的第i个词。语言模型的目标是,希望模型对有意义的句子赋予大概率,对没意义的句子赋予小概率。 +这样的模型可以应用于很多领域,如机器翻译、语音识别、信息检索、词性标注、手写识别等,它们都希望能得到一个连续序列的概率。 以信息检索为例,当你在搜索“how long is a football bame”时(bame是一个医学名词),搜索引擎会提示你是否希望搜索"how long is a football game", 这是因为根据语言模型计算出“how long is a football bame”的概率很低,而与bame近似的,可能引起错误的词中,game会使该句生成的概率最大。 + +对语言模型的目标概率$P(w_1, ..., w_T)$,如果假设文本中每个词都是相互独立的,则整句话的联合概率可以表示为其中所有词语条件概率的乘积,即: + +$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$ + +然而我们知道语句中的每个词出现的概率都与其前面的词紧密相关, 所以实际上通常用条件概率表示语言模型: + +$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$ + + + +### N-gram neural model + +在计算语言学中,n-gram是一种重要的文本表示方法,表示一个文本中连续的n个项。基于具体的应用场景,每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法,用n-gram训练语言模型时,一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。 + +Yoshua Bengio等科学家就于2003年在著名论文 Neural Probabilistic Language Models \[[1](#参考文献)\] 中介绍如何学习一个神经元网络表示的词向量模型。文中的神经概率语言模型(Neural Network Language Model,NNLM)通过一个线性映射和一个非线性隐层连接,同时学习了语言模型和词向量,即通过学习大量语料得到词语的向量表达,通过这些向量得到整个句子的概率。用这种方法学习语言模型可以克服维度灾难(curse of dimensionality),即训练和测试数据不同导致的模型不准。注意:由于“神经概率语言模型”说法较为泛泛,我们在这里不用其NNLM的本名,考虑到其具体做法,本文中称该模型为N-gram neural model。 + +我们在上文中已经讲到用条件概率建模语言模型,即一句话中第$t$个词的概率和该句话的前$t-1$个词相关。可实际上越远的词语其实对该词的影响越小,那么如果考虑一个n-gram, 每个词都只受其前面`n-1`个词的影响,则有: + +$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$ + +给定一些真实语料,这些语料中都是有意义的句子,N-gram模型的优化目标则是最大化目标函数: + +$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ + +其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。 + +
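+
+作为对照,最朴素的统计 n-gram 语言模型直接用计数的比值来估计这类条件概率。下面的纯 Python 片段示意 3-gram 情形下的最大似然估计(仅为示意,`corpus` 为假设的小语料,真实语料还需要平滑等处理):
+
+```python
+# 示意代码:基于计数的 3-gram 条件概率 P(w | history)
+from collections import Counter
+
+corpus = ['i have a dream', 'i have a cat', 'you have a dream']
+trigrams, bigrams = Counter(), Counter()
+for sent in corpus:
+    words = ['<s>'] + sent.split() + ['<e>']
+    for i in range(len(words) - 2):
+        trigrams[tuple(words[i:i + 3])] += 1
+        bigrams[tuple(words[i:i + 2])] += 1
+
+def prob(w, history):
+    # P(w | history) = count(history + w) / count(history)
+    return trigrams[history + (w,)] / float(bigrams[history])
+
+print(prob('dream', ('have', 'a')))  # 2/3
+```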

+图2. N-gram神经网络模型
+ +图2展示了N-gram神经网络模型,从下往上看,该模型分为以下几个部分: + - 对于每个样本,模型输入$w_{t-n+1},...w_{t-1}$, 输出句子第t个词为字典中`|V|`个词的概率。 + + 每个输入词$w_{t-n+1},...w_{t-1}$首先通过映射矩阵映射到词向量$C(w_{t-n+1}),...C(w_{t-1})$。 + + - 然后所有词语的词向量连接成一个大向量,并经过一个非线性映射得到历史词语的隐层表示: + + $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$ + + 其中,$x$为所有词语的词向量连接成的大向量,表示文本历史特征;$\theta$、$U$、$b_1$、$b_2$和$W$分别为词向量层到隐层连接的参数。$g$表示未经归一化的所有输出单词概率,$g_i$表示未经归一化的字典中第$i$个单词的输出概率。 + + - 根据softmax的定义,通过归一化$g_i$, 生成目标词$w_t$的概率为: + + $$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$ + + - 整个网络的损失值(cost)为多类分类交叉熵,用公式表示为 + + $$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$ + + 其中$y_k^i$表示第$i$个样本第$k$类的真实标签(0或1),$softmax(g_k^i)$表示第i个样本第k类softmax输出的概率。 + + + +### Continuous Bag-of-Words model(CBOW) + +CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示: + +

+图3. CBOW模型
+ +具体来说,不考虑上下文的词语输入顺序,CBOW是用上下文词语的词向量的均值来预测当前词。即: + +$$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$ + +其中$x_t$为第$t$个词的词向量,分类分数(score)向量 $z=U*context$,最终的分类$y$采用softmax,损失函数采用多类分类交叉熵。 + +### Skip-gram model + +CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。 + +

+图4. Skip-gram模型
+ +如上图所示,Skip-gram模型的具体做法是,将一个词的词向量映射到$2n$个词的词向量($2n$表示当前输入词的前后各$n$个词),然后分别通过softmax得到这$2n$个词的分类损失值之和。 + + +## 数据准备 + +### 数据介绍 + +本教程使用Penn Treebank (PTB)(经Tomas Mikolov预处理过的版本)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下: + +

+| 训练数据 | 验证数据 | 测试数据 |
+| ------ | ------ | ------ |
+| ptb.train.txt | ptb.valid.txt | ptb.test.txt |
+| 42068句 | 3370句 | 3761句 |

+ + +### 数据预处理 + +本章训练的是5-gram模型,表示在PaddlePaddle训练时,每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`,自动做数据的下载与预处理,方便大家使用。 + +预处理会把数据集中的每一句话前后加上开始符号``以及结束符号``。然后依据窗口大小(本教程中为5),从头到尾每次向右滑动窗口并生成一条数据。 + +如"I have a dream that one day" 一句提供了5条数据: + +```text + I have a dream +I have a dream that +have a dream that one +a dream that one day +dream that one day +``` + +最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 + + +## 编程实现 + +本配置的模型结构如下图所示: + +

+图5. 模型配置中的N-gram神经网络模型
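+
+在进入具体配置之前,可以先用一小段纯 Python 示意上文数据预处理中描述的滑窗切分与索引化过程(仅为示意,`toy_dict` 是假设的迷你字典,实际流程由 `paddle.dataset.imikolov` 自动完成):
+
+```python
+# 示意代码:按窗口大小 5 对一句话做滑窗切分并转成索引
+def build_ngrams(sentence, word_dict, n=5, unk='<unk>'):
+    words = ['<s>'] + sentence.split() + ['<e>']
+    unk_id = word_dict.get(unk, 0)
+    samples = []
+    for i in range(len(words) - n + 1):
+        window = [word_dict.get(w, unk_id) for w in words[i:i + n]]
+        # 前 n-1 个索引作为输入,最后一个索引是要预测的词
+        samples.append((window[:-1], window[-1]))
+    return samples
+
+toy_dict = {w: i for i, w in enumerate(
+    '<s> <e> <unk> I have a dream that one day'.split())}
+print(build_ngrams('I have a dream that one day', toy_dict))  # 共 5 条数据
+```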
+ +首先,加载所需要的包: + +```python +import paddle +import paddle.fluid as fluid +import numpy +from functools import partial +import math +import os +import sys +from __future__ import print_function +``` + +然后,定义参数: +```python +EMBED_SIZE = 32 # word vector dimension +HIDDEN_SIZE = 256 # hidden layer dimension +N = 5 # train 5-gram +BATCH_SIZE = 32 # batch size + +# can use CPU or GPU +use_cuda = os.getenv('WITH_GPU', '0') != '0' + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) +``` + +不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`,我们就可以直接用它来构造 N-gram 神经网络。 + +- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏,我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。 + +```python +def inference_program(is_sparse): + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64') + + embed_first = fluid.layers.embedding( + input=first_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=second_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=third_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_fourth = fluid.layers.embedding( + input=fourth_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_fourth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') + return predict_word +``` + +- 基于以上的神经网络结构,我们可以如下定义我们的`训练`方法 + +```python +def train_program(is_sparse): + # The declaration of 'next_word' must be after the invoking of inference_program, + # or the data input order of train program would be [next_word, firstw, secondw, + # thirdw, fourthw], which is not correct. + predict_word = inference_program(is_sparse) + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = fluid.layers.mean(cost) + return avg_cost +``` + +- 现在我们可以开始训练啦。如今的版本较之以前就简单了许多。我们有现成的训练和测试集:`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。在PaddlePaddle中,读取器是一个Python的函数,每次调用,会读取下一条数据。它是一个Python的generator。 + +`paddle.batch` 会读入一个读取器,然后输出一个批次化了的读取器。`event_handler`亦可以一并传入`trainer.train`来时不时的输出每个步骤,批次的训练情况。 + +```python +def optimizer_func(): + # Note here we need to choose more sophisticated optimizers + # such as AdaGrad with a decay rate. The normal SGD converges + # very slowly. 
+ # optimizer=fluid.optimizer.SGD(learning_rate=0.001), + return fluid.optimizer.AdagradOptimizer( + learning_rate=3e-3, + regularization=fluid.regularizer.L2DecayRegularizer(8e-4)) + + +def train(use_cuda, train_program, params_dirname): + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + # We output cost every 10 steps. + if event.step % 10 == 0: + outs = trainer.test( + reader=test_reader, + feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) + avg_cost = outs[0] + + print("Step %d: Average Cost %f" % (event.step, avg_cost)) + + # If average cost is lower than 5.8, we consider the model good enough to stop. + # Note 5.8 is a relatively high value. In order to get a better model, one should + # aim for avg_cost lower than 3.5. But the training could take longer time. + if avg_cost < 5.8: + trainer.save_params(params_dirname) + trainer.stop() + + if math.isnan(avg_cost): + sys.exit("got NaN loss, training failed.") + + trainer = fluid.Trainer( + train_func=train_program, + optimizer_func=optimizer_func, + place=place) + + trainer.train( + reader=train_reader, + num_epochs=1, + event_handler=event_handler, + feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) +``` + +- `trainer.train`将会开始训练。从`event_handler`返回的监控情况如下: + +```text +Step 0: Average Cost 7.337213 +Step 10: Average Cost 6.136128 +Step 20: Average Cost 5.766995 +... +``` + + +## 模型应用 +在模型训练后,我们可以用它做一些预测。 + +### 预测下一个词 +我们可以用我们训练过的模型,在得知之前的 N-gram 后,预测下一个词。 + +```python +def infer(use_cuda, inference_program, params_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=params_dirname, place=place) + + # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word + # is simply an index to look up for the corresponding word vector and hence + # the shape of word (base_shape) should be [1]. The length-based level of + # detail (lod) info of each LoDtensor should be [[1]] meaning there is only + # one lod_level and there is only one sequence of one word on this level. + # Note that lod info should be a list of lists. + + data1 = [[211]] # 'among' + data2 = [[6]] # 'a' + data3 = [[96]] # 'group' + data4 = [[4]] # 'of' + lod = [[1]] + + first_word = fluid.create_lod_tensor(data1, lod, place) + second_word = fluid.create_lod_tensor(data2, lod, place) + third_word = fluid.create_lod_tensor(data3, lod, place) + fourth_word = fluid.create_lod_tensor(data4, lod, place) + + result = inferencer.infer( + { + 'firstw': first_word, + 'secondw': second_word, + 'thirdw': third_word, + 'fourthw': fourth_word + }, + return_numpy=False) + + print(numpy.array(result[0])) + most_possible_word_index = numpy.argmax(result[0]) + print(most_possible_word_index) + print([ + key for key, value in word_dict.iteritems() + if value == most_possible_word_index + ][0]) +``` + +在经历3分钟的短暂训练后,我们得到如下的预测。我们的模型预测 `among a group of` 的下一个词是`a`。这比较符合文法规律。如果我们训练时间更长,比如几个小时,那么我们会得到的下一个预测是 `workers`。 + +```text +[[0.00106646 0.0007907 0.00072041 ... 
0.00049024 0.00041355 0.00084464]] +6 +a +``` + +整个程序的入口很简单: + +```python +def main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + params_dirname = "word2vec.inference.model" + + train( + use_cuda=use_cuda, + train_program=partial(train_program, is_sparse), + params_dirname=params_dirname) + + infer( + use_cuda=use_cuda, + inference_program=partial(inference_program, is_sparse), + params_dirname=params_dirname) + + +main(use_cuda=use_cuda, is_sparse=True) +``` + + +## 总结 +本章中,我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中,我们可以根据向量间的余弦夹角,来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中,训练好的词向量可以用来初始化模型,以得到更好的效果。在文档分类中,有了词向量之后,可以用聚类的方法将文档中同义词进行分组,也可以用 N-gram 来预测下一个词。希望大家在本章后能够自行运用词向量进行相关领域的研究。 + + +## 参考文献 +1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. journal of machine learning research, 2003, 3(Feb): 1137-1155. +2. Mikolov T, Kombrink S, Deoras A, et al. [Rnnlm-recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201. +3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013. +4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605. +5. https://en.wikipedia.org/wiki/Singular_value_decomposition + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/index.rst b/doc/fluid/new_docs/beginners_guide/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e18933dcc0038129077a455892ddd785579f0003 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/index.rst @@ -0,0 +1,15 @@ +######## +新手入门 +######## + +.. todo:: + + 新手入门的导引文字,需要完善。 + +.. toctree:: + :maxdepth: 2 + + install/install_doc.rst + quick_start/index.rst + basics/index.rst + basics/learning_materials.md diff --git a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst new file mode 100644 index 0000000000000000000000000000000000000000..18788d2eae048ac5120b0b7afd63cd784a235798 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst @@ -0,0 +1,564 @@ +.. _how_to_install: + +安装说明 +^^^^^^^^ + +若您的系统为Linux或Windows,您可以使用我们提供的安装包来安装PaddlePaddle。 + +对于MacOS系统,我们暂未提供安装包,您可以使用 **从源码编译** 的方式安装。 + + +.. _install_linux: + +在Linux安装PaddlePaddle +-------- + +推荐您使用 `pip `_ +安装,它是Linux系统下最简单的安装方式。 + +注意事项: + +- PaddlePaddle Python API 依赖Python 2.7版本。 + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 + + .. code-block:: bash + + pip install paddlepaddle + +您可以通过指定版本号来安装其它版本,例如: + + .. code-block:: bash + + pip install paddlepaddle==0.13.0 + + +如果需要安装支持GPU的版本(cuda9.0_cudnn7_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +PaddlePaddle针对不同需求提供了更多版本的安装包,部分列表如下: + +================================= ======================================== +版本号 版本说明 +================================= ======================================== +paddlepaddle-gpu==0.14.0 使用CUDA 9.0和cuDNN 7编译的0.14.0版本 +paddlepaddle-gpu==0.14.0.post87 使用CUDA 8.0和cuDNN 7编译的0.14.0版本 +paddlepaddle-gpu==0.14.0.post85 使用CUDA 8.0和cuDNN 5编译的0.14.0版本 +paddlepaddle-gpu==0.13.0 使用CUDA 9.0和cuDNN 7编译的0.13.0版本 +paddlepaddle-gpu==0.12.0 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 +paddlepaddle-gpu==0.11.0.post87 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 +paddlepaddle-gpu==0.11.0.post8 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 +paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 +================================= ======================================== + +您可以在 `Release History `_ +中找到paddlepaddle-gpu的各个发行版本。 + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl +安装包和c-api开发包并安装,您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. 
csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "stable_cuda9.0_cudnn7", "`paddlepaddle_gpu-0.14.0-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0-cp27-cp27m-manylinux1_x86_64.whl `__" + "stable_cuda8.0_cudnn7", "`paddlepaddle_gpu-0.14.0.post87-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0.post87-cp27-cp27m-manylinux1_x86_64.whl `__" + "stable_cuda8.0_cudnn5", "`paddlepaddle_gpu-0.14.0.post85-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0.post85-cp27-cp27m-manylinux1_x86_64.whl `__" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + +.. _FAQ: + +安装常见问题和解决方法 +====================== + +- paddlepaddle*.whl is not a supported wheel on this platform. + +出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。 +请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准, +需要使用最新的pip (>9.0.0) 才可以安装。 + +可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + +如果仍然存在问题,可以执行: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + +获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包 +可以在 `这里 `_ 找到。 + +如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; +如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64, +可以重命名这个whl包为 manylinux1_x86_64 再安装。 + + +.. _install_windows: + +在Windows安装PaddlePaddle +------------------------------ +Windows系统需要通过Docker来使用PaddleaPaddle。Docker是一个虚拟容器,使用Docker可以简化复杂的环境配置工作。 + +我们提供了 `PaddlePaddle_Windows快速安装包 `_, +它能够帮助您安装Docker和PaddlePaddle。 + +* 安装包支持的系统:Windows7,Windows8的所有版本,Windows10的专业版、企业版。 + +* 如果您希望使用GPU提升训练速度,请使用Linux系统安装,Windows系统暂不支持。 + +.. _install_mac: + +在MacOS安装PaddlePaddle +-------- + +对于MacOS系统,我们暂未提供pip安装方式,您可以使用 **源码编译** 的方式安装。 + +.. _others: + +其他安装方式 +------------- + +.. _source: +源码编译(使用Docker镜像) +========== + +.. _requirements: + +需要的软硬件 +""""""""""""" + +为了编译PaddlePaddle,我们需要 + +1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统 +2. Docker + +不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。 + +.. _build_step: + +编译方法 +""""""""""""" + +PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像可以在 `这里 `_ 找到。 + + +**I. 编译CPU-Only版本的PaddlePaddle,需要执行:** + +.. code-block:: bash + + # 1. 获取源码 + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 2. 执行如下命令下载最新版本的docker镜像 + docker run --name paddle-test -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash + # 3. 进入docker内执行如下命令编译CPU-Only的二进制安装包 + mkdir -p /paddle/build && cd /paddle/build + cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + make -j$(nproc) + +**II. 编译GPU版本的PaddlePaddle,需要执行:** + +.. code-block:: bash + + # 1. 获取源码 + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 2. 
安装nvidia-docker + apt-get install nvidia-docker + # 3. 执行如下命令下载支持GPU运行的docker容器 + nvidia-docker run --name paddle-test-gpu -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash + # 4. 进入docker内执行如下命令编译GPU版本的PaddlePaddle + mkdir -p /paddle/build && cd /paddle/build + cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF + make -j$(nproc) + +**注意事项:** + +* 上述有关 :code:`docker` 的命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 +* 进入 :code:`docker` 后执行 :code:`cmake` 命令,若是出现 :code:`patchelf not found, please install it.` 错误,则执行 :code:`apt-get install -y patchelf` 命令即可解决问题。 +* 若您在使用Docker编译PaddlePaddle遇到问题时, `这个issue `_ 可能会对您有所帮助。 + + +.. _source: +源码编译(不使用Docker镜像) +========== + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `附录:编译依赖`_ 之后才能开始编译的步骤。 + +.. _build_step: + +编译方法 +""""""""""""" + +在本机上编译CPU-Only版本的PaddlePaddle,需要执行如下命令: + +.. code-block:: bash + + # 1. 使用virtualenvwrapper创建python虚环境并将工作空间切换到虚环境 [可选] + mkvirtualenv paddle-venv + workon paddle-venv + # 2. 获取源码 + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 3. 执行下面的命令编译CPU-Only的二进制 + mkdir build && cd build + cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译 + + +**注意事项:** + +* MacOS系统下因为默认安装了cblas库,所以编译时可能会遇到 :code:`use of undeclared identifier 'openblas_set_num_threads'` 错误。因此,在执行cmake命令时需要指定所使用openblas库的头文件路径,具体操作如下: + + .. code-block:: bash + + cd Paddle/build && rm -rf * + cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF -DOPENBLAS_INC_DIR=/usr/local/Cellar/openblas/[本机所安装的openblas版本号]/include/ + make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译 +* 若您在MacOS系统下从源码编译PaddlePaddle遇到问题时, `这个issue `_ 可能会对您有所帮助。 + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + +如果机器中已经安装过PaddlePaddle,有两种方法: + +.. code-block:: bash + + 1. 先卸载之前的版本,再重新安装 + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. 直接升级到更新的版本 + pip install build/python/dist/*.whl -U + +.. _run_test: + +执行单元测试 +""""""""""""" + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build + +如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ): + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build + cd /paddle/build + ctest -R test_sum_op -V + +.. _faq_docker: + +常见问题 +""""""""""""" + +- 什么是 Docker? + + 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 + +- Docker 还是虚拟机? + + 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +- 为什么用 Docker? + + 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 + + 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 + +- 可以选择不用Docker吗? + + 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 + +- 学习 Docker 有多难? + + 理解 Docker 并不难,大概花十分钟看一下 `这篇文章 `_。 + 这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + +- 可以用 IDE 吗? 
+ + 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 + + 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 + + .. code-block:: bash + + (global-set-key "\C-cc" 'compile) + (setq compile-command + "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + + 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 + +- 可以并行编译吗? + + 是的。我们的 Docker image 运行一个 `Bash 脚本 `_。这个脚本调用 :code:`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + +- Docker 需要 sudo + + 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 + +- 在 Windows/MacOS 上编译很慢 + + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue `_。 + +- 磁盘不够 + + 本文中的例子里, :code:`docker run` 命令里都用了 :code:`--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 :code:`docker ps -a` 命令看到停止后但是没有删除的 containers。 :code:`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 `_ 来清理这些内容。 + + +.. _compile_deps: + +附录:编译依赖 +""""""""""""" + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", "3.4", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "SWIG", ">=2.0", "" + "wget","","" + "openblas","","" + "pip", ">=9.0", "" + "numpy", "", "" + "protobuf","3.1.0","" + "wheel","","" + "Go", ">=1.8", "可选" + + +.. _build_options: + +附录:编译选项 +""""""""""""" + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "OFF" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录( :code:`rm -rf` )后,再指定。 + +.. _install_docker: + +使用Docker安装运行 +================== + +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境。 +您可以在 `Docker官网 `_ +获得基本的Docker安装和使用方法。 + +在了解Docker的基本使用方法之后,即可开始下面的步骤: + +.. _docker_pull: + +获取PaddlePaddle的Docker镜像 +"""""""""""""""""""""""""""" + +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: + + .. code-block:: bash + + docker pull paddlepaddle/paddle + +对于国内用户,我们提供了加速访问的镜像源: + + .. code-block:: bash + + docker pull docker.paddlepaddlehub.com/paddle + +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: + + .. 
code-block:: bash + + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu + +选择下载使用不同的BLAS库的Docker镜像: + + .. code-block:: bash + + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas + +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu + +.. _docker_run: + +在Docker中执行PaddlePaddle训练程序 +""""""""""""""""""""""""""""""""""" + +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: + + .. code-block:: bash + + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 + +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py + +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** + +.. _docker_run_book: + +使用Docker启动PaddlePaddle Book教程 +"""""""""""""""""""""""""""""""""""" + +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 +如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 + +我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: + + .. code-block:: bash + + docker run -p 8888:8888 paddlepaddle/book + +国内用户可以使用下面的镜像源来加速访问: + + .. code-block: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + +然后在浏览器中输入以下网址: + + .. code-block:: text + + http://localhost:8888/ + +就这么简单,享受您的旅程! + +.. _docker_run_gpu: + +使用Docker执行GPU训练 +"""""""""""""""""""""""""""" + +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 + + .. code-block:: bash + + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash + +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** + + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +**关于AVX:** + +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 + +以下指令能检查Linux电脑是否支持AVX: + + .. 
code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi + +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/fluid/new_docs/beginners_guide/install/paddleci.png b/doc/fluid/new_docs/beginners_guide/install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/install/paddleci.png differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..9574dbea2f9a39bb196b61bb4fd12ba7c378f75a --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md @@ -0,0 +1,288 @@ +# 线性回归 +让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。 + +本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/137.html)。 + +## 背景介绍 +给定一个大小为$n$的数据集 ${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$,其中$x_{i1}, \ldots, x_{id}$是第$i$个样本$d$个属性上的取值,$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述,即 + +$$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldots,n$$ + +例如,在我们将要建模的房价预测问题里,$x_{ij}$是描述房子$i$的各种属性(比如房间的个数、周围学校和医院的个数、交通状况等),而 $y_i$是房屋的价格。 + +初看起来,这个假设实在过于简单了,变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点,它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍\[[2,3,4](#参考文献)\]也选择对线性模型独立成章重点讲解。 + +## 效果展示 +我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。 +
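+
+类似下图的散点图可以用 matplotlib 画出。下面是一段示意代码(仅为示意,`y_true`、`y_pred` 为假设的两组房价数据,实际数值应来自训练好的模型):
+
+```python
+# 示意代码:绘制“预测值 vs 真实值”散点图,虚线为 y = x
+import numpy as np
+import matplotlib.pyplot as plt
+
+y_true = np.array([8.5, 21.0, 17.3, 33.8, 24.2])
+y_pred = np.array([10.2, 19.5, 18.1, 30.4, 25.0])
+
+plt.scatter(y_true, y_pred)
+lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
+plt.plot(lims, lims, '--')  # 预测完全准确的点会落在这条虚线上
+plt.xlabel('true price (median)')
+plt.ylabel('predicted price')
+plt.show()
+```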

+图1. 预测值 V.S. 真实值
+ +## 模型概览 + +### 模型定义 + +在波士顿房价数据集中,和房屋相关的值共有14个:前13个用来描述房屋相关的各种信息,即模型中的 $x_i$;最后一个值为我们要预测的该类房屋价格的中位数,即模型中的 $y_i$。因此,我们的模型就可以表示成: + +$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ + +$\hat{Y}$ 表示模型的预测结果,用来和真实值$Y$区分。模型要学习的参数即:$\omega_1, \ldots, \omega_{13}, b$。 + +建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。 + +对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是: + +$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ + +即对于一个大小为$n$的测试集,$MSE$是$n$个数据预测结果误差平方的均值。 + +### 训练过程 + +定义好模型结构之后,我们要通过以下几个步骤进行模型训练 + 1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。 + 2. 网络正向传播计算网络输出和损失函数。 + 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 + 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 + +## 数据集 + +### 数据集介绍 +这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下: + +| 属性名 | 解释 | 类型 | +| ------| ------ | ------ | +| CRIM | 该镇的人均犯罪率 | 连续值 | +| ZN | 占地面积超过25,000平方呎的住宅用地比例 | 连续值 | +| INDUS | 非零售商业用地比例 | 连续值 | +| CHAS | 是否邻近 Charles River | 离散值,1=邻近;0=不邻近 | +| NOX | 一氧化氮浓度 | 连续值 | +| RM | 每栋房屋的平均客房数 | 连续值 | +| AGE | 1940年之前建成的自用单位比例 | 连续值 | +| DIS | 到波士顿5个就业中心的加权距离 | 连续值 | +| RAD | 到径向公路的可达性指数 | 连续值 | +| TAX | 全值财产税率 | 连续值 | +| PTRATIO | 学生与教师的比例 | 连续值 | +| B | 1000(BK - 0.63)^2,其中BK为黑人占比 | 连续值 | +| LSTAT | 低收入人群占比 | 连续值 | +| MEDV | 同类房屋价格的中位数 | 连续值 | + +### 数据预处理 +#### 连续值与离散值 +观察一下数据,我们的第一个发现是:所有的13维属性中,有12维的连续值和1维的离散值(CHAS)。离散值虽然也常使用类似0、1、2这样的数字表示,但是其含义与连续值是不同的,因为这里的差值没有实际意义。例如,我们用0、1、2来分别表示红色、绿色和蓝色的话,我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有$d$个可能取值的离散属性,我们会将它们转为$d$个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言,因为CHAS本身就是一个二值属性,就省去了这个麻烦。 + +#### 属性的归一化 +另外一个稍加观察即可发现的事实是,各维属性的取值范围差别很大(如图2所示)。例如,属性B的取值范围是[0.32, 396.90],而属性NOX的取值范围是[0.3850, 0.8170]。这里就要用到一个常见的操作-归一化(normalization)了。归一化的目标是把各位属性的取值范围放缩到差不多的区间,例如[-0.5,0.5]。这里我们使用一种很常见的操作方法:减掉均值,然后除以原取值范围。 + +做归一化(或 [Feature scaling](https://en.wikipedia.org/wiki/Feature_scaling))至少有以下3个理由: +- 过大或过小的数值范围会导致计算时的浮点上溢或下溢。 +- 不同的数值范围会导致不同属性对模型的重要性不同(至少在训练的初始阶段如此),而这个隐含的假设常常是不合理的。这会对优化的过程造成困难,使训练时间大大的加长。 +- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。 + +

+图2. 各维属性的取值范围
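+
+下面用 numpy 给出“减掉均值、除以原取值范围”这种归一化方式的示意实现(仅为示意,`feature_data` 为假设的 [样本数, 13] 数组;本教程使用的 uci_housing 模块已经封装了同样的数据预处理):
+
+```python
+# 示意代码:按列做归一化
+import numpy as np
+
+def normalize(feature_data):
+    mean = feature_data.mean(axis=0)                                   # 各维属性的均值
+    value_range = feature_data.max(axis=0) - feature_data.min(axis=0)  # 各维属性的取值范围
+    return (feature_data - mean) / value_range
+```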
+ +#### 整理训练集与测试集 +我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$ + + +在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 + +## 训练 + +`fit_a_line/trainer.py`演示了训练的整体过程。 + +### 配置数据提供器(Datafeeder) +首先我们引入必要的库: +```python +import paddle +import paddle.fluid as fluid +import numpy +from __future__ import print_function +``` + +我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) + +其中,在uci_housing模块中封装了: + +1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。 +2. [数据预处理](#数据预处理)的过程。 + +接下来我们定义了用于训练和测试的数据提供器。提供器每次读入一个大小为`BATCH_SIZE`的数据批次。如果用户希望加一些随机性,她可以同时定义一个批次大小和一个缓存大小。这样的话,每次数据提供器会从缓存中随机读取批次大小那么多的数据。 + +```python +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.test(), buf_size=500), + batch_size=BATCH_SIZE) +``` + +### 配置训练程序 +训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲,它就是一个从输入到输出的简单的全连接层。更加复杂的结果,比如卷积神经网络,递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值,因为它会被后面反向传播算法所用到。 + +```python +def train_program(): + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + # feature vector of length 13 + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + + loss = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(loss) + + return avg_loss +``` + +### Optimizer Function 配置 + +在下面的 `SGD optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 + +```python +def optimizer_program(): + return fluid.optimizer.SGD(learning_rate=0.001) +``` + +### 定义运算场所 +我们可以定义运算是发生在CPU还是GPU + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 创建训练器 +训练器会读入一个训练程序和一些必要的其他参数: + +```python +trainer = fluid.Trainer( + train_func=train_program, + place=place, + optimizer_func=optimizer_program) +``` + +### 开始提供数据 +PaddlePaddle提供了读取数据者发生器机制来读取训练数据。读取数据者会一次提供多列数据,因此我们需要一个Python的list来定义读取顺序。 + +```python +feed_order=['x', 'y'] +``` + +除此之外,可以定义一个事件相应器来处理类似`打印训练进程`的事件: + +```python +# Specify the directory to save the parameters +params_dirname = "fit_a_line.inference.model" + +# Plot data +from paddle.v2.plot import Ploter +train_title = "Train cost" +test_title = "Test cost" +plot_cost = Ploter(train_title, test_title) + +step = 0 + +# event_handler prints training and testing info +def event_handler_plot(event): + global step + if isinstance(event, fluid.EndStepEvent): + if step % 10 == 0: # record a train cost every 10 batches + plot_cost.append(train_title, step, event.metrics[0]) + + if step % 100 == 0: # record a test cost every 100 batches + test_metrics = trainer.test( + reader=test_reader, feed_order=feed_order) + plot_cost.append(test_title, step, test_metrics[0]) + plot_cost.plot() + + if test_metrics[0] < 10.0: + # If the accuracy is good enough, we can stop the training. 
+ print('loss is less than 10.0, stop') + trainer.stop() + step += 1 + + if isinstance(event, fluid.EndEpochEvent): + if event.epoch % 10 == 0: + # We can save the trained parameters for the inferences later + if params_dirname is not None: + trainer.save_params(params_dirname) +``` + +### 开始训练 +我们现在可以通过调用`trainer.train()`来开始训练 + +```python +%matplotlib inline + +# The training could take up to a few minutes. +trainer.train( + reader=train_reader, + num_epochs=100, + event_handler=event_handler_plot, + feed_order=feed_order) +``` +
+图3. 训练结果
+ + +## 预测 +提供一个`inference_program`和一个`params_dirname`来初始化预测器。`params_dirname`用来存储我们的参数。 + +### 设定预测程序 +类似于`trainer.train`,预测器需要一个预测程序来做预测。我们可以稍加修改我们的训练程序来把预测值包含进来。 + + +```python +def inference_program(): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + return y_predict +``` + +### 预测 +预测器会从`params_dirname`中读取已经训练好的模型,来对从未遇见过的数据进行预测。 + +```python +inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=params_dirname, place=place) + +batch_size = 10 +test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size) +test_data = test_reader().next() +test_x = numpy.array([data[0] for data in test_data]).astype("float32") +test_y = numpy.array([data[1] for data in test_data]).astype("float32") + +results = inferencer.infer({'x': test_x}) + +print("infer results: (House Price)") +for idx, val in enumerate(results[0]): + print("%d: %.2f" % (idx, val)) + +print("\nground truth:") +for idx, val in enumerate(test_y): + print("%d: %.2f" % (idx, val)) +``` + +## 总结 +在这章里,我们借助波士顿房价这一数据集,介绍了线性回归模型的基本概念,以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来,因此弄清楚线性模型的原理和局限非常重要。 + + +## 参考文献 +1. https://en.wikipedia.org/wiki/Linear_regression +2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001. +3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012. +4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/index.rst b/doc/fluid/new_docs/beginners_guide/quick_start/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..f5889ba52b8016596108de48bad59f238c16afc0 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/quick_start/index.rst @@ -0,0 +1,13 @@ +######## +快速入门 +######## + +.. todo:: + + 概述 + +.. toctree:: + :maxdepth: 2 + + fit_a_line/README.cn.md + recognize_digits/README.cn.md diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..ac36c4ecf6b9b716fe5f0dbe2346e64918c22242 --- /dev/null +++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md @@ -0,0 +1,447 @@ +# 识别数字 + +本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/167.html)。 + +## 背景介绍 +当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。 + +

+图1. MNIST图片示例
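+
+后文“数据介绍”一节会用到的 `paddle.dataset.mnist` 模块可以直接读取这些图片。下面的小片段示意取出一条训练样本(仅为示意,假设环境中已按后文方式安装了带 `paddle.dataset` 的 PaddlePaddle):
+
+```python
+# 示意代码:取出一条 MNIST 训练样本并查看
+import numpy as np
+import paddle
+
+reader = paddle.dataset.mnist.train()  # 返回一个 reader creator
+image, label = next(reader())          # 每次迭代得到 (图像, 标签)
+print(np.array(image).shape, label)    # 图像为展平后的像素向量,标签为 0~9
+```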
+ +MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。 + +Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程中提出了卷积神经网络(Convolutional Neural Network),大幅度地提高了手写字符的识别能力,也因此成为了深度学习领域的奠基人之一。如今的深度学习领域,卷积神经网络占据了至关重要的地位,从最早Yann LeCun提出的简单LeNet,到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等(请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程),人们在图像分类领域,利用卷积神经网络得到了一系列惊人的结果。 + +有很多算法在MNIST上进行实验。1998年,LeCun分别用单层线性分类器、多层感知器(Multilayer Perceptron, MLP)和多层卷积神经网络LeNet进行实验,使得测试集上的误差不断下降(从12%下降到0.7%)\[[1](#参考文献)\]。此后,科学家们又基于K近邻(K-Nearest Neighbors)算法\[[2](#参考文献)\]、支持向量机(SVM)\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验,并采用多种预处理方法(如去除歪曲、去噪、模糊等)来提高识别的准确率。 + +本教程中,我们从简单的模型Softmax回归开始,带大家入门手写字符识别,并逐步进行模型优化。 + + +## 模型概览 + +基于MNIST数据训练一个分类器,在介绍本教程使用的三个基本图像分类网络前,我们先给出一些定义: +- $X$是输入:MNIST图片是$28\times28$ 的二维图像,为了进行计算,我们将其转化为$784$维向量,即$X=\left ( x_0, x_1, \dots, x_{783} \right )$。 +- $Y$是输出:分类器的输出是10类数字(0-9),即$Y=\left ( y_0, y_1, \dots, y_9 \right )$,每一维$y_i$代表图片分类为第$i$类数字的概率。 +- $L$是图片的真实标签:$L=\left ( l_0, l_1, \dots, l_9 \right )$也是10维,但只有一维为1,其他都为0。 + +### Softmax回归(Softmax Regression) + +最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征,然后直接通过softmax 函数进行多分类\[[9](#参考文献)\]。 + +输入层的数据$X$传到输出层,在激活操作之前,会乘以相应的权重 $W$ ,并加上偏置变量 $b$ ,具体如下: + +$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ + +其中 $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ + +对于有 $N$ 个类别的多分类问题,指定 $N$ 个输出节点,$N$ 维结果向量经过softmax将归一化为 $N$ 个[0,1]范围内的实数值,分别表示该样本属于这 $N$ 个类别的概率。此处的 $y_i$ 即对应该图片为数字 $i$ 的预测概率。 + +在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下: + +$$ L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$ + +图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 + +

+图2. softmax回归网络结构图
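+
+下面用 numpy 按上面的公式直接实现 softmax 与交叉熵,便于对照理解(仅为示意,`logits`、`label` 均为假设的输入):
+
+```python
+# 示意代码:softmax 与交叉熵的数值计算
+import numpy as np
+
+def softmax(x):
+    e = np.exp(x - x.max())  # 减去最大值以保证数值稳定
+    return e / e.sum()
+
+def cross_entropy(label_onehot, y):
+    return -np.sum(label_onehot * np.log(y))
+
+logits = np.array([1.0, 2.0, 0.5])  # 假设的 W*x + b 结果(3 类示意)
+y = softmax(logits)
+label = np.array([0, 1, 0])         # 真实类别为第 1 类
+print(y, cross_entropy(label, y))
+```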
+ +### 多层感知器(Multilayer Perceptron, MLP) + +Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。 + +1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。 +2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。 +3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。 + + +图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 + +

+图3. 多层感知器网络结构图
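+
+按上面的三个公式,可以用 numpy 写出多层感知器的一次前向计算作为示意(仅为示意,`W1`、`b1` 等参数为随机初始化的假设值,激活函数取 ReLU):
+
+```python
+# 示意代码:MLP 的一次前向计算
+import numpy as np
+
+def relu(x):
+    return np.maximum(0, x)
+
+def softmax(x):
+    e = np.exp(x - x.max())
+    return e / e.sum()
+
+x = np.random.rand(784)  # 一张展平后的 28x28 图片(假设输入)
+W1, b1 = np.random.randn(200, 784) * 0.01, np.zeros(200)
+W2, b2 = np.random.randn(200, 200) * 0.01, np.zeros(200)
+W3, b3 = np.random.randn(10, 200) * 0.01, np.zeros(10)
+
+h1 = relu(W1.dot(x) + b1)     # H1 = phi(W1 X + b1)
+h2 = relu(W2.dot(h1) + b2)    # H2 = phi(W2 H1 + b2)
+y = softmax(W3.dot(h2) + b3)  # Y = softmax(W3 H2 + b3)
+print(y.argmax(), y.sum())    # 预测类别与概率之和(应为 1.0)
+```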
+ +### 卷积神经网络(Convolutional Neural Network, CNN) + +在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。 + +

+图4. LeNet-5卷积神经网络结构
+ +#### 卷积层 + +卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。 + +
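+
+为帮助理解下文图5中的卷积计算过程,这里先用 numpy 给出单通道二维卷积(含填充 P 与步长 S)的朴素示意实现(仅为示意,真实训练中由框架的卷积算子完成):
+
+```python
+# 示意代码:单通道二维卷积
+import numpy as np
+
+def conv2d(image, kernel, stride=1, pad=0):
+    h, w = image.shape
+    f = kernel.shape[0]
+    padded = np.pad(image, pad, mode='constant')
+    out_h = (h - f + 2 * pad) // stride + 1  # 对应 Ho = (H - F + 2P)/S + 1
+    out_w = (w - f + 2 * pad) // stride + 1
+    out = np.zeros((out_h, out_w))
+    for i in range(out_h):
+        for j in range(out_w):
+            region = padded[i * stride:i * stride + f, j * stride:j * stride + f]
+            out[i, j] = np.sum(region * kernel)  # 滑动窗口内做内积
+    return out
+
+print(conv2d(np.random.rand(5, 5), np.random.rand(3, 3), stride=2, pad=1).shape)  # (3, 3)
+```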

+图5. 卷积层图片
+ +图5给出一个卷积计算过程的示例图,输入图像大小为$H=5,W=5,D=3$,即$5 \times 5$大小的3通道(RGB,也称作深度)彩色图像。这个示例图中包含两(用$K$表示)组卷积核,即图中滤波器$W_0$和$W_1$。在卷积计算中,通常对不同的输入通道采用不同的卷积核,如图示例中每组卷积核包含($D=3)$个$3 \times 3$(用$F \times F$表示)大小的卷积核。另外,这个示例中卷积核在图像的水平方向($W$方向)和垂直方向($H$方向)的滑动步长为2(用$S$表示);对输入图像周围各填充1(用$P$表示)个0,即图中输入层原始数据为蓝色部分,灰色部分是进行了大小为1的扩展,用0来进行扩展。经过卷积操作得到输出为$3 \times 3 \times 2$(用$H_{o} \times W_{o} \times K$表示)大小的特征图,即$3 \times 3$大小的2通道特征图,其中$H_o$计算公式为:$H_o = (H - F + 2 \times P)/S + 1$,$W_o$同理。 而输出特征图中的每个像素,是每组滤波器与输入图像每个特征图的内积再求和,再加上偏置$b_o$,偏置通常对于每个输出特征图是共享的。输出特征图$o[:,:,0]$中的最后一个$-2$计算如图5右下角公式所示。 + +在卷积操作中卷积核是可学习的参数,经过上面示例介绍,每层卷积的参数大小为$D \times F \times F \times K$。在多层感知器模型中,神经元通常是全部连接,参数较多。而卷积层的参数较少,这也是由卷积层的主要特性即局部连接和共享权重所决定。 + +- 局部连接:每个神经元仅与输入神经元的一块区域连接,这块局部区域称作感受野(receptive field)。在图像卷积操作中,即神经元在空间维度(spatial dimension,即上图示例H和W所在的平面)是局部连接,但在深度上是全部连接。对于二维图像本身而言,也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想,也是受启发于生物学里面的视觉系统结构,视觉皮层的神经元就是局部接受信息的。 + +- 权重共享:计算同一个深度切片的神经元时采用的滤波器是共享的。例如图4中计算$o[:,:,0]$的每个每个神经元的滤波器均相同,都为$W_0$,这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的,例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意的,比如输入的图片是人脸,眼睛和头发位于不同的位置,希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的,在卷积层,通常采用多组卷积核提取不同特征,即对应不同深度切片的特征,不同深度切片的神经元权重是不共享。另外,偏重对同一深度切片的所有神经元都是共享的。 + +通过介绍卷积计算过程及其特性,可以看出卷积是线性操作,并具有平移不变性(shift-invariant),平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小,这样也有利于训练较大卷积神经网络。 + +#### 池化层 + +

+图6. 池化层图片
+ +池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 + +更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。 + +### 常见激活函数介绍 +- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ + +- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $ + + 实际上,tanh函数只是规模变化的sigmoid函数,将sigmoid函数值放大2倍之后再向下平移1个单位:tanh(x) = 2sigmoid(2x) - 1 。 + +- ReLU激活函数: $ f(x) = max(0, x) $ + +更详细的介绍请参考[维基百科激活函数](https://en.wikipedia.org/wiki/Activation_function)。 + +## 数据介绍 + +PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mnist/)数据的模块`paddle.dataset.mnist`。加载后的数据位于`/home/username/.cache/paddle/dataset/mnist`下: + + +| 文件名称 | 说明 | +|----------------------|-------------------------| +|train-images-idx3-ubyte| 训练数据图片,60,000条数据 | +|train-labels-idx1-ubyte| 训练数据标签,60,000条数据 | +|t10k-images-idx3-ubyte | 测试数据图片,10,000条数据 | +|t10k-labels-idx1-ubyte | 测试数据标签,10,000条数据 | + +## Fluid API 概述 + +演示将使用最新的 `Fluid API`。Fluid API是最新的 PaddlePaddle API。它在不牺牲性能的情况下简化了模型配置。 +我们建议使用 Fluid API,因为它更容易学起来。 + +下面是快速的 Fluid API 概述。 +1. `inference_program`:指定如何从数据输入中获得预测的函数。 +这是指定网络流的地方。 + +1. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。 +这是指定损失计算的地方。 + +1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。 + +1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。 +通过 `event_handler` 回调函数,用户可以监控培训的进展。 + +1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。 +然后,它可以推断数据和返回预测。 + +在这个演示中,我们将深入了解它们。 + +## 配置说明 +加载 PaddlePaddle 的 Fluid API 包。 + +```python +import paddle +import paddle.fluid as fluid +from __future__ import print_function +``` + +### Program Functions 配置 + +我们需要设置“推理程序”函数。我们想用这个程序来演示三个不同的分类器,每个分类器都定义为 Python 函数。 +我们需要将图像数据馈送到分类器。Paddle 为读取数据提供了一个特殊的层 `layer.data` 层。 +让我们创建一个数据层来读取图像并将其连接到分类网络。 + +- Softmax回归:只通过一层简单的以softmax为激活函数的全连接层,就可以得到分类的结果。 + +```python +def softmax_regression(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + predict = fluid.layers.fc( + input=img, size=10, act='softmax') + return predict +``` + +- 多层感知器:下面代码实现了一个含有两个隐藏层(即全连接层)的多层感知器。其中两个隐藏层的激活函数均采用ReLU,输出层的激活函数用Softmax。 + +```python +def multilayer_perceptron(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + # 第一个全连接层,激活函数为ReLU + hidden = fluid.layers.fc(input=img, size=200, act='relu') + # 第二个全连接层,激活函数为ReLU + hidden = fluid.layers.fc(input=hidden, size=200, act='relu') + # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return prediction +``` + +- 卷积神经网络LeNet-5: 输入的二维图像,首先经过两次卷积层到池化层,再经过全连接层,最后使用以softmax为激活函数的全连接层作为输出层。 + +```python +def convolutional_neural_network(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + # 第一个卷积-池化层 + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + # 第二个卷积-池化层 + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return prediction +``` + +#### Train Program 配置 +然后我们需要设置训练程序 `train_program`。它首先从分类器中进行预测。 +在训练期间,它将从预测中计算 `avg_cost`。 + 
+**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 + +请随意修改代码,测试 Softmax 回归 `softmax_regression`, `MLP` 和 卷积神经网络 `convolutional neural network` 分类器之间的不同结果。 + +```python +def train_program(): + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # predict = softmax_regression() # uncomment for Softmax回归 + # predict = multilayer_perceptron() # uncomment for 多层感知器 + predict = convolutional_neural_network() # uncomment for LeNet5卷积神经网络 + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + acc = fluid.layers.accuracy(input=predict, label=label) + return [avg_cost, acc] + + +``` + +#### Optimizer Function 配置 + +在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 + +```python +def optimizer_program(): + return fluid.optimizer.Adam(learning_rate=0.001) +``` + +### 数据集 Feeders 配置 + +下一步,我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。 + +下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。 + +`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。 + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=64) + +test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=64) +``` + +### Trainer 配置 + +现在,我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer`。 + +```python +# 该模型运行在单个CPU上 +use_cuda = False # set to True if training with GPU +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + +trainer = fluid.Trainer( + train_func=train_program, place=place, optimizer_func=optimizer_program) +``` + +#### Event Handler 配置 + +Fluid API 在训练期间为回调函数提供了一个钩子。用户能够通过机制监控培训进度。 +我们将在这里演示两个 `event_handler` 程序。请随意修改 Jupyter 笔记本 ,看看有什么不同。 + +`event_handler` 用来在训练过程中输出训练结果 + +```python +# Save the parameter into a directory. The Inferencer can load the parameters from it to do infer +params_dirname = "recognize_digits_network.inference.model" +lists = [] +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 100 == 0: + # event.metrics maps with train program return arguments. + # event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. + print("Pass %d, Batch %d, Cost %f" % ( + event.step, event.epoch, event.metrics[0])) + + if isinstance(event, fluid.EndEpochEvent): + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=['img', 'label']) + + print("Test with Epoch %d, avg_cost: %s, acc: %s" % (event.epoch, avg_cost, acc)) + + # save parameters + trainer.save_params(params_dirname) + lists.append((event.epoch, avg_cost, acc)) +``` + +`event_handler_plot` 可以用来在训练过程中画图如下: + +
+图7. 训练结果
+ + +```python +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) +step = 0 +lists = [] + +# event_handler to plot a figure +def event_handler_plot(event): + global step + if isinstance(event, fluid.EndStepEvent): + if step % 100 == 0: + # event.metrics maps with train program return arguments. + # event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. + cost_ploter.append(train_title, step, event.metrics[0]) + cost_ploter.plot() + step += 1 + if isinstance(event, fluid.EndEpochEvent): + # save parameters + trainer.save_params(params_dirname) + + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=['img', 'label']) + cost_ploter.append(test_title, step, avg_cost) + lists.append((event.epoch, avg_cost, acc)) +``` + +#### 开始训练 + +既然我们设置了 `event_handler` 和 `data reader`,我们就可以开始训练模型了。 + +`feed_order` 用于将数据目录映射到 `train_program` + +```python +trainer.train( + num_epochs=5, + event_handler=event_handler, + reader=train_reader, + feed_order=['img', 'label']) +``` + +训练过程是完全自动的,event_handler里打印的日志类似如下所示: + +``` +Pass 0, Batch 0, Cost 0.125650 +Pass 100, Batch 0, Cost 0.161387 +Pass 200, Batch 0, Cost 0.040036 +Pass 300, Batch 0, Cost 0.023391 +Pass 400, Batch 0, Cost 0.005856 +Pass 500, Batch 0, Cost 0.003315 +Pass 600, Batch 0, Cost 0.009977 +Pass 700, Batch 0, Cost 0.020959 +Pass 800, Batch 0, Cost 0.105560 +Pass 900, Batch 0, Cost 0.239809 +Test with Epoch 0, avg_cost: 0.053097883707459624, acc: 0.9822850318471338 +``` + +训练之后,检查模型的预测准确度。用 MNIST 训练的时候,一般 softmax回归模型的分类准确率为约为 92.34%,多层感知器为97.66%,卷积神经网络可以达到 99.20%。 + + +## 应用模型 + +可以使用训练好的模型对手写体数字图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断。 + +### Inference 配置 + +`Inference` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 +我们可以简单地插入在此之前定义的分类器。 + +```python +inferencer = fluid.Inferencer( + # infer_func=softmax_regression, # uncomment for softmax regression + # infer_func=multilayer_perceptron, # uncomment for MLP + infer_func=convolutional_neural_network, # uncomment for LeNet5 + param_path=params_dirname, + place=place) +``` + +### 生成预测输入数据 + +`infer_3.png` 是数字 3 的一个示例图像。把它变成一个 numpy 数组以匹配数据馈送格式。 + +```python +# Prepare the test image +import os +import numpy as np +from PIL import Image +def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32) + im = im / 255.0 * 2.0 - 1.0 + return im + +cur_dir = cur_dir = os.getcwd() +img = load_image(cur_dir + '/image/infer_3.png') +``` + +### 预测 + +现在我们准备做预测。 + +```python +results = inferencer.infer({'img': img}) +lab = np.argsort(results) # probs and lab are the results of one batch data +print ("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) +``` + +## 总结 + +本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型,后续章节中复杂的神经网络都是从它们衍生出来的,因此这几个模型对之后的学习大有裨益。同时,我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候,MNIST数据集上的识别准确率有了大幅度的提升,原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候,希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外,本教程还介绍了PaddlePaddle模型搭建的基本流程,从dataprovider的编写、网络层的构建,到最后的训练和预测。对这个流程熟悉以后,大家就可以用自己的数据,定义自己的网络模型,并完成自己的训练和预测任务了。 + +## 参考文献 + +1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324. +2. Wejéus, Samuel. 
["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014). +3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190. +4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003. +5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007. +6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220. +7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. +8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. +9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. +10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/faq/faq.rst b/doc/fluid/new_docs/faq/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..3b4bd4f895162fa3b0ba12e785e38ad694590b25 --- /dev/null +++ b/doc/fluid/new_docs/faq/faq.rst @@ -0,0 +1,12 @@ +################### +编译安装与单元测试 +################### + +1. 通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` +------------------------------------------------------------------------------------------ +出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, +但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` +拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, +即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 + +**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 diff --git a/doc/fluid/new_docs/faq/index_cn.rst b/doc/fluid/new_docs/faq/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..bb2ed99217609d3a9edd179d4f98ad5b8b649860 --- /dev/null +++ b/doc/fluid/new_docs/faq/index_cn.rst @@ -0,0 +1,9 @@ +FAQ +==== + +本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处,请您到 `PaddlePaddle社区 `_ 查找答案或直接提 `issue `_ ,我们会及时进行回复。 + +.. toctree:: + :maxdepth: 1 + + faq.rst diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst new file mode 100644 index 0000000000000000000000000000000000000000..55c3c761f932713ffa2b462b35f9f46a8edae536 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst @@ -0,0 +1,392 @@ +================================ +PaddleFluid设计思想和基本使用概念 +================================ + + + +Paddle Fluid 是用来让用户像 PyTorch 和 Tensorflow Eager Execution 一样执行程序。 +在这些系统中,不再有模型这个概念,应用也不再包含一个用于描述 Operator 图或者一系列层的符号描述, +而是像通用程序那样描述训练或者预测的过程。 + + +深度学习平台的演化 +================ + +时至今日,深度学习已成为事实上最流行的机器学习技术。学术界多年研究加上工业界的长期实践提出了若干有效的基本建模单元: +全连接,卷积,循环神经网络等;设计各类训练技巧:初始化方法,跨层连接,各类 norm 技术等; +发明了各种新的优化算法:Adadelta,Adam 等; +各类固定的网络结构:highway, residual, attention 等纷纷涌现,不胜枚举。 +学术界工业界多年的付出共同促成了深度学习方法今日的影响力。 + +学术研究和生产实践中积累了大量的知识,能够很好的解释神经网络中基本模块各自独的学习能力和特性。 +基本模块和训练技术的组合能够搭建出千变万化的神经网络模型。 +基本模块和训练技术是有限的,但他们的组合却是千变万化,这是深度学习方法的魅力所在,也是难度所在。 + +正是这样高度的模块化特性,研究者和工程师们都在努力避免重复造轮子以提高研究和生产的效率, +又进一步催生了深度学习平台技术的发展,深度学习框架已演变成为 AI 基础设施中重要的一部分。 +从 Theano,到 DistBelief,到 TensorFlow;从 Caffe 到 Caffe2; +从 Torch 到 PyTorch;从 PaddlePaddle 到 PaddleFluid, +深度学习平台技术也经历了两代的演化,并向着第三代平台技术迈进。 + +站在历史发展的今天,当我们准备切换尝试使用一个新的深度学习平台作为支持自己学习和研究的工具时, +平台技术都发生了哪些演化,能够为我们的带来什么便利呢? + +先让我们来看看深度学习框架解决的三大问题: + +- 如何描述计算以支持未来潜在会出现的新模型? +- 如何高效利用异构设备最大化算力? +- 如何利用网络中的计算机进行分布式计算来处理千万亿级别的数据? 
+ +以上三个问题中的第一个和使用者研究者最为密切相关。 +这篇文章我们通过分析 PaddleFluid的设计理念, +来了解一个深度学习框架如何抽象深度学习模型,来看看我们的使用经验如何在不同深度学习平台之间过度和迁移。 + +如何描述计算 +============= + +让我们首先来看看 PaddleFluid 如何描述机器学习模型 + + +PaddleFluid之 :code:`Program` + +如何描述计算很大程度决定了一个神经网络框架计算功能的完备性。 +深度学习模型和方法历经二十多年的发展:“依次执行一组计算的前向, +再以和前向计算相反的顺序执行反向计算,中间无分支无交互”, +这样的模型结构已经无法满足研究者和千千万万框架使用者的想象力。 + +从 `PaddleFluid 的设计目标 `_ 来看, +在如何描述机器学习模型这一核心问题上,PaddleFluid 的目标是: +创造一种新的计算描述方式,不但能够描述至今为止人们已知的主流神经网络模型,并且能够支持未来会出现的任意模型。 + +PaddleFluid 是如何做到支持未来出现的新模型这一目标呢?PaddleFluid 的设计选择是: +对用户来说,用一段 :code:`Program` (在 PaddleFluid 内部会被转化为一种叫作 :code:`ProgramDesc` 的描述语言), +而不是用计算图来描述机器学习模型。 :code:`Program` 用符合用户使用直觉的方式, +提供一种新的描述语言能够描述任意复杂的机器学习模型。 + +对所有计算机专业同学学习编程语言的第一课一定是建立对“程序语言的三种执行结构:顺序执行,条件选择和循环执行”的认识。 +计算机世界的所有可计算逻辑都是由这三种执行结构表示,用这三种结构描述的逻辑是可计算的。那么同样道理, +对一个神经网络框架来说,如果可以和程序语言一样提供对这三种执行结构的支持,那么将可以描述任意复杂的, +可被计算机计算的机器学习模型。PaddleFluid通过提供对这三种执行结构的支持,来做到对任意复杂模型的描述。 + +具体来说: + +1. Fluid 的核心设计理念都可以类比到程序语言,如果已经有写程序的经验,那么使用 Fluid 构建神经网络模型的体验,将非常接近写程序; + +2. 在 PaddleFluid 中,用户不会显示地感知“计算图”这样的概念,一个机器学习模型被描述为一个 Fluid :code:`Program` (Fluid 内部称之为 :code:`ProgramDesc` ); + +- 一个 Fluid :code:`Program` 由一组嵌套的 :code:`Block` 构成。 :code:`Block` 的概念可以类比到 C++ 或是 Java 中的一对大括号,或是 Python 语言中的一个缩进快; +- :code:`Block` 中的计算由顺序执行、条件选择或者循环执行三种方式组合,构成复杂的计算逻辑。 + +3. Fluid :code:`Program` 中包含对计算和计算对象的描述。计算的描述称之为 Operator;计算作用的对象(或者说 Operator 的输入和输出)被统一为 Tensor。 + +在描述计算和计算的作用对象这一问题上,各个深度学习框架的选择是相同的,如果有一个平台的使用经验,那么将非常容易在各个平台之间进行迁移。 + +核心使用概念 +============= + +下面,我们将更详细地了解核心使用概念在PaddlePaddle的使用方法。 + +数据表示和计算的对象:Tensor +-------------------------- + +Tensor 是向量矩阵概念的扩展,是神经网络模型计算操作的基本对象。这在是今天所有主流深度学习平台的共同选择。 + +可以简单地将 Tensor 理解为一个 N 维向量,它可以有任意多的维度。一个 Tensor 具有两个基本特征: + +1. 数据类型:每个 Tensor 的所有元素具有同样的、已知的数据类型; + +2. 大小(或者说形状):即维度的个数(rank,阶)以及各维度的长度。 + +Tensor 某些维度的长度在定义模型阶段可能是未知的,在实际算法执行时才能确定。例如一个 mini-batch 中包含的样本数目(batch size),或者是一个 mini-batch 中序列的最大长度。 + +PaddleFluid中的Tensor +"""""""""""""""""""""" + +PaddleFluid 中也使用 Tensor 作为神经网络中输入输出数据的统一表示。Tensor 的概念在今天主流的深度学习平台中都是完全相同,可以在各个深度学习框架之间直接无缝迁移。 + +在 Fluid 中也同样存在三种特殊的 Tensor: + +1. 模型中的可学习参数 + +模型中的可学习参数生存期和整个训练任务一样长,会接受优化算法的更新。在 PaddleFluid 中同样以 :code:`Variable` 表示; +用户在绝大多数情况下都不需要自己来创建网络中的可学习参数,Fluid 为几乎常见的神经网络基本计算模块都提供了封装。 +以最简单的全连接模型为例,下面的代码片段会直接为全连接层创建连接权值 WW 和偏置( :code:`bias` )两个可学习参数, +无需显示地调用 variable 相关接口创建可学习参数。 + + +:: + + import paddle.fluid as fluid + + y = fluid.layers.fc(input=x, size=128, bias_attr=True) + +2. 输入输出Tensor + +整个神经网络的输入数据也是一个特殊的 Tensor,在这个 Tensor 中, +一些维度的大小在定义模型时无法确定(通常包括:batch size; +如果 mini-batch 之间,数据可变,也会包括序列的最大长度,图片的宽度和高度等),在定义模型时需要占位; +PaddleFluid 中使用 :code:`fluid.layers.data` 来接入输入数据, :code:`fluid.layer.data` 需要提供输入 Tensor 的 形状信息, +当遇到无法确定的维度 时, 相应维度指定为 None ,如下面的代码片段所示: + +:: + + import paddle.fluid as fluid + + x = fluid.layers.data(name="x", shape=[2, None, 3], dtype="int64") + +3. 
常量 Tensor 在 PaddleFluid 中需要通过组合 Tensor 和 :code:`fluid.layers.assign` 来实现。 + + +计算原语:Operation/Operator +---------------------------- + +Tensor 是今天所有主流深度学习框架的统一数据表示(输入、输出、中间计算结果、模型的可学习参数都是 Tensor)。 +另一方面,对数据的操作,在主流深度学习框架中也高度统一为:Operator/Operation。 +在中文中,通常我们会习惯将其称之为算子。 + +注:在 PaddleFluid 中使用 Operator 称呼对 Tensor 的操作。 + +Operation/Operator 接受多个 Tensor 作为输入,输出若干个 Tensor,表示了从输入到输出的变化。 + +PaddleFluid中的Operator +"""""""""""""""""""""""" + +PaddleFluid 支持的所有算子,可以在 `API 帮助文档 `_ 中查看。 + +为了便于用户使用,在 Python 端,Fluid 中的 Operator 被进一步封装入 :code:`paddle.fluid.layers` , +:code:`paddle.fluid.networks` 等模块。这是因为:一些常见的对Tensor的操作可能是有更多基础操作构成, +例如:l2 norm 内部由 reduce、elementwise_add,scale 等多个 Operator 组合计算逻辑完成, +为了提高使用的便利性,框架内部对基础 Operator 进行了一些封装,包括创建 Operator 依赖可学习参数, +可学习参数的初始化细节等,减少用户重复开发的成本。 + +对所有深度学习框架都面临同样的封装,在绝大多数情况下,用户很少会直接与框架底层的 Operator 直接打交道,而是使用框架提供的 layers,networks 等模块,降低开发的代码量。不论是什么样的概念,他们在各框架之间的本质和作用都是相同的:对 Tensor 的变换。 + +总结 +>>>>>> + +不论叫作 Operation、Operator 还是 layers,他们在各深度学习平台中的含义和作用都是相同的:对 Tensor 的变换。是一个深度学习平台提供的基础计算能力。可以在每个平台各自的 API 帮助文档中查到。 + +在各个深度学习平台都已加入 ONNX 项目的今天,每个深度学习平台提供给大家的基本算子都已趋同,与此同时,每个平台也各有其特点,会提供一些独特的算子,方便某一类任务的开发。 + +构建模型并执行 +-------------- + +整个训练任务运行方法如下: + +Fluid中的Program和Executor +""""""""""""""""""""""""""" + +1. Fluid 使用 :code:`Program` 描述神经网络模型,对用户来说,并没有计算图的概念。 +用户定义的所有 Tensor 以及对 Tensor 的操作:Operator 都会被加入一段 :code:`Program` 中; + +一段 Program 由嵌套的 :code:`Block` 构成,但用户无需显示地创建 :code:`Block` 或是显示地注意到 :code:`Block` 的存在; +在 Fluid 程序中, :code:`Block` 是在调用 :code:`while_op` , :code:`if_op` , :code:`parallel_do` 等特殊 :code:`Operator` 时,由这些 :code:`Operator` 来创建; +对用户使用来说,只需要知道自己正在向一段 Fluid Program 中添加变量( :code:`Tensor` )和操作( :code:`Operator` )即可。 + +2. Fluid 利用 :code:`Executor` 来执行一段 Fluid :code:`Program` 。 + +为进一步理解 Fluid 中 :code:`Executor` 的作用,需要先解释一下 Fluid 程序的执行流程。 下图展示单机上,Fluid 程序的执行流程: + +.. figure:: fluid_local_train.jpeg + + :scale: 50% + :align: center + + Figure.1 + + Fluid本地训练任务执行流程图 + +1. Fluid 设计思想和灵感非常类似于程序设计语言,和高级编译语言 C++/Java 编写程序的过程非常类似,Fluid 程序执行分为两个重要阶段:编译时和运行时; + +2. 编译期,用户通过调用 Fluid 提供的算子,向一段 :code:`Program` 中添加变量(Tensor)以及对变量的操作(Operators 或者 Layers)。用户只需要描述核心的前向计算,不需要关心反向计算,分布式下,异构设备下如何计算; + +3. 原始的 :code:`Program` 在平台内部转换为中间描述语言: :code:`ProgramDesc` ; + +4. 编译期最重要的一个功能模块是 Transpiler。Transpiler 接受一段 :code:`ProgramDesc` ,输出一段变化后的 :code:`ProgramDesc` ,作为后端 Executor 最终需要执行的 :code:`Fluid Program` ; + +最为常用的 Transipler 包括: + +1. 内存优化 Transipler:通过对变量读写依赖关系分析,插入内存回收 Operator 以维持运行过程中较小的内存开销; + +2. 分布式环境下的 Transpiler:接受用户定义的 local Program ,生成 Parameter Client 和 Parameter Server 执行的两段 :code:`Program` 。 + +3. 后端 Executor 接受 Transpiler 输出的这段 :code:`Program` ,依次执行其中的 Operator(可以类比为程序语言中的指令),在执行过程中会为 Operator 创建所需的输入输出并进行管理。 + +从上面的过程中可以看到,Fluid 程序的执行过程分为:编译器的定义 :code:`Program` ,和创建 :code:`Executor` 运行 :code:`Program` 。 + :code:`Executor` 执行一段 :code:`Program` 的过程是不可交互和不可中断的。 + +在 Fluid 中,可以创建多余一段 :code:`Program` 。默认情况,一个 PaddleFluid 程序中存在 2 段 Program: + +1. :code:`fluid.framework.default_startup_program` :其中定义了创建模型参数,输入输出,以及模型中可学习参数的初始化等各种操作; + +- :code:`default_startup_program` 可以由框架自动生成,使用时无需显示地创建; +- 如果调用修改了参数的默认初始化方式,框架会自动的将相关的修改加入 :code:`default_startup_program` 。 + +2. :code:`fluid.framework.default_main_program` :定义了神经网络模型,前向反向计算,以及优化算法对网络中可学习参数的更新; + +- 使用 Fluid 的核心就是构建起 :code:`default_main_program` 。 + +3. PaddleFluid 中的 :code:`Scope` 类似于 TensorFlow 中的 collection 这一概念,但在 Fluid 中 :code:`Scope` 是框架后端概念,用户无法直接操作。因此,在使用框架时无需关心。 + +总结 +""""" + +Fluid 中通过 Executor 来执行一段用户定义的 Fluid :code:`Program` 。 +1. Executor 连接了 Fluid 的前端和后端; + +2. 
Executor 接受用户定义的原始模型(一段 :code:`Program` ),通过调用系统中不同功能更的 :code:`Transpiler` 完成对原始 :code:`Program` 的变化,进行优化。 + +完整实例:如何完成一个机器学习模型的训练 +=================================== + + + +这一节,我们以 MNIST 手写数字识别问题 —— 机器学习任务的“Hello World”问题和数据,为例,通过一个可以运行的完整实例,来学习上文介绍的概念如何在PaddleFluid 平台使用。 + +步骤1:定义数据 +---------------- + +PaddleFluid 中以 :code:`fluid.layers.data` 来接收输入数据。 + +:: + + import numpy as np + + import paddle.fluid as fluid + import paddle.v2 as paddle + + # define the input layers for the network. + x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32") + y_ = fluid.layers.data(name="label", shape=[1], dtype="int64") + +Fluid 中 Tensor 的第 0 维度固定为 batch size。在上面代码段中,图像输入 :code:`x` 的形状为:[1, 28, 28]。这三个维度的含义分别是:channel 数目,图像的高度和宽度。 + +实际上 Fluid 框架内部,一幅图像输入是一个 4-D Tensor,所有 Tensor 的第 0 维固定为 batch size。框架内部会自动为batch size进行填充占位。无需对batch size指定填充占位。 + +如果除去 batch size(第 0 维度)外,如果 Tensor 某一维度的大小只能在运行时确定,可以在该位置上直接指定 :code:`None` 进行占位。 + +步骤2:定义模型 +-------------- + +通过调用 Fluid 提供的算子定义含有一个隐层的神经网络。Fluid 模型的分为模型结构和优化方法两部分。这一点与 TensorFlow 程序十分相似似,使用概念可以直接对应进行迁移。 + +:: + + # define the network topology. + y = fluid.layers.fc(input=x, size=10, act="softmax") + loss = fluid.layers.cross_entropy(input=y, label=y_) + avg_loss = fluid.layers.mean(loss) + + # define the optimization algorithm. + optimizer = fluid.optimizer.Adam(learning_rate=1e-3) + optimizer.minimize(avg_loss) + +Fluid 使用 Program 而不是计算图描述模型,一般情况下,用户无需关心 Program 的细节,当调用以上 layers 时,会向一个全局的 Program: :code:`fluid.framework.default_main_program` 中插入变量(Tensor)和对变量的操作(上述代码段中的 layers 和 optimzier)。 + +步骤3:参数初始化 +---------------- + +如上文介绍,Fluid 程序中的 Executor 是连接 Fluid 前端和后端的接口。 + +默认一个Fluid模型存在至少两段 Program。用于初始化网络中的可学习参数的那一段 :code:`Program` 叫作 :code:`fluid.default_startup_program()` 。 + +只有执行器 executor 可以执行 Fluid Program,因此,在初始化网络中的可学习参数之前,需要首先创建一个 Fluid executor。 + +:: + + # define the executor. + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + +在以上代码段中, :code:`place` 用于告诉 executor 一段 Fluid Program 在何种设备上执行, +常见的有 :code:`fluid.CPUPlace()` 和 :code:`fluid.CUDAPlace()` 。 + +步骤4:数据输入 + 执行模型训练 +---------------------------- + +我们在步骤 2 中定义的神经网络模型最终被插入一段叫做 :code:`fluid.framework.default_main_program` 的 Fluid Program 中。 + +网络可学习参数初始化之后,可以通过让执行器 Executor 执行这段 :code:`fluid.framework.default_main_program` 来进行训练。 + +:: + + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000), + batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y_]) + + for pass_id in range(100): + for batch_id, data in enumerate(train_reader()): + loss = exe.run( + fluid.framework.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_loss]) + print("Cur Cost : %f" % (np.array(loss[0])[0])) + +从上面的代码片段中可以看到,Fluid 程序的训练过程和 TensorFlow 程序的训练过程非常接近, +都放在一个 :code:`for` 循环中,循环读取一个 mini-batch 数据, +调用执行器执行 Fluid :code:`default_main_program` :接收 mini-batch 输入,在其上进行前向,反向和参数更新计算。 + +`注:上面程序使用了 Fluid 内置的 MNIST 数据,和我们提供给 TensorFlow 示例程序的 MNIST 数据完全一样。` + +步骤5:观察模型效果 +----------------- + +以上步骤已经构成了完整的 Tensorflow 模型训练程序,每个 batch 观察一次 loss,可以直观看到模型的迭代效果: + +.. figure:: fluid_mnist.png + + :scale: 40% + :align: center + + Figure.2 + + Fluid MNIST手写数字识别任务代价下降曲线 + +附:完整代码 +------------ + +:: + + import numpy as np + + import paddle.fluid as fluid + import paddle.v2 as paddle + + + def main(): + BATCH_SIZE = 128 + + # define the input layers for the network. 
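+        # 注:shape 只描述单个样本的形状;第 0 维的 batch size 由框架在运行时自动填充,无需在 shape 中给出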
+ x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32") + y_ = fluid.layers.data(name="label", shape=[1], dtype="int64") + + # define the network topology. + y = fluid.layers.fc(input=x, size=10, act="softmax") + loss = fluid.layers.cross_entropy(input=y, label=y_) + avg_loss = fluid.layers.mean(loss) + + optimizer = fluid.optimizer.Adam(learning_rate=5e-3) + optimizer.minimize(avg_loss) + + # define the executor. + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000), + batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y_]) + + for pass_id in range(100): + for batch_id, data in enumerate(train_reader()): + loss = exe.run( + fluid.framework.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_loss]) + print("Cur Cost : %f" % (np.array(loss[0])[0])) + + if __name__ == "__main__": + main() diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..0a495901fafb85987e34acc3c454fb87e8160fca Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg differ diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png new file mode 100644 index 0000000000000000000000000000000000000000..e5ad0ba058c863cf68ef0789e58fcf67b3115fdb Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png differ diff --git a/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst b/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..5946a2ccb7e43004eae39ec4b3c6112c66c1fd04 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst @@ -0,0 +1,88 @@ +.. 
_user_guide_configure_simple_model: + +############## +配置简单的网络 +############## + +在解决实际问题时,可以先从逻辑层面对问题进行建模,明确模型所需要的 **输入数据类型**、**计算逻辑**、**求解目标** 以及 **优化算法**。PaddlePaddle提供了丰富的算子来实现模型逻辑。下面以一个简单回归任务举例说明如何使用PaddlePaddle构建模型。该例子完整代码参见 `fit_a_line `_。 + +问题描述及定义 +############## + +问题描述: 给定一组数据 :math:``,求解出函数 :math:`f`,使得 :math:`y=f(x)`,其中 :math:`x\subset X` 表示一条样本的特征,为 :math:`13` 维的实数向量;:math:`y \subset Y` 为一实数表示该样本对应的值。 + +我们可以尝试用回归模型来对问题建模,回归问题的损失函数有很多,这里选择常用的均方误差。为简化问题,这里假定 :math:`f` 为简单的线性变换函数,同时选用随机梯度下降算法来求解模型。 + ++----------------+----------------------------------------------+ +| 输入数据类型 | 样本特征: 13 维 实数 | ++ +----------------------------------------------+ +| | 样本标签: 1 维 实数 | ++----------------+----------------------------------------------+ +| 计算逻辑 | 使用线性模型,产生 1维实数作为模型的预测输出 | ++----------------+----------------------------------------------+ +| 求解目标 | 最小化模型预测输出与样本标签间的均方误差 | ++----------------+----------------------------------------------+ +| 优化算法 | 随机梯度下降 | ++----------------+----------------------------------------------+ + +使用PaddlePadle建模 +################### + +从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后,需要使用PaddlePaddle提供的API及算子来实现模型逻辑。一个典型的模型主要包含4个部分,分别是:输入数据格式定义,模型前向计算逻辑,损失函数以及优化算法。 + +数据层 +------ + +PaddlePaddle提供了 :code:`fluid.layers.data()` 算子来描述输入数据的格式。 + +:code:`fluid.layers.data()` 算子的输出是一个Variable。这个Variable的实际类型是Tensor。Tensor具有强大的表征能力,可以表示多维数据。为了精确描述数据结构,通常需要指定数据shape以及数值类型type。其中shape为一个整数向量,type可以是一个字符串类型。目前支持的数据类型参考 :ref:`user_guide_paddle_support_data_types` 。 模型训练一般会使用batch的方式读取数据,而batch的size在训练过程中可能不固定。data算子会依据实际数据来推断batch size,所以这里提供shape时不用关心batch size,只需关心一条样本的shape即可,更高级用法请参考 :ref:`user_guide_customize_batch_size_rank`。从上知,:math:`x` 为 :math:`13` 维的实数向量,:math:`y` 为实数,可使用下面代码定义数据层: + +.. code-block:: python + + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +该模型使用的数据比较简单,事实上data算子还可以描述变长的、嵌套的序列数据。也可以使用 :code:`open_files` 打开文件进行训练。更详细的文档可参照 :ref:`user_guide_prepare_data`。 + +前向计算逻辑 +------------ + +实现一个模型最重要的部分是实现计算逻辑,PaddlePaddle提供了丰富的算子。这些算子的封装粒度不同,通常对应一种或一组变换逻辑。算子输出即为对输入数据执行变换后的结果。用户可以灵活使用算子来完成复杂的模型逻辑。比如图像相关任务中会使用较多的卷积算子、序列任务中会使用LSTM/GRU等算子。复杂模型通常会组合多种算子,以完成复杂的变换。PaddlePaddle提供了非常自然的方式来组合算子,一般地可以使用下面的方式: + +.. code-block:: python + + op_1_out = fluid.layers.op_1(input=op_1_in, ...) + op_2_out = fluid.layers.op_2(input=op_1_out, ...) + ... + +其中op_1和op_2表示算子类型,可以是fc来执行线性变换(全连接),也可以是conv来执行卷积变换等。通过算子的输入输出的连接来定义算子的计算顺序以及数据流方向。上面的例子中,op_1的输出是op_2的输入,那么在执行计算时,会先计算op_1,然后计算op_2。更复杂的模型可能需要使用控制流算子,依据输入数据来动态执行,针对这种情况,PaddlePaddle提供了IfElseOp和WhileOp等。算子的文档可参考 :code:`fluid.layers`。具体到这个任务, 我们使用一个fc算子: + +.. code-block:: python + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + +损失函数 +-------- + +损失函数对应求解目标,我们可以通过最小化损失来求解模型。大多数模型使用的损失函数,输出是一个实数值。但是PaddlePaddle提供的损失算子一般是针对一条样本计算。当输入一个batch的数据时,损失算子的输出有多个值,每个值对应一条样本的损失,所以通常会在损失算子后面使用mean等算子,来对损失做归约。模型在一次前向迭代后会得到一个损失值,PaddlePaddle会自动执行链式求导法则计算模型里面每个参数和变量对应的梯度值。这里使用均方误差损失: + +.. code-block:: python + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + +优化方法 +-------- + +确定损失函数后,可以通过前向计算得到损失值,然后通过链式求导法则得到参数的梯度值。获取梯度值后需要更新参数,最简单的算法是随机梯度下降法::math:`w=w - \eta \cdot g`。但是普通的随机梯度下降算法存在一些问题: 比如收敛不稳定等。为了改善模型的训练速度以及效果,学术界先后提出了很多优化算法,包括: :code:`Momentum`、:code:`RMSProp`、:code:`Adam` 等。这些优化算法采用不同的策略来更新模型参数,一般可以针对具体任务和具体模型来选择优化算法。不管使用何种优化算法,学习率一般是一个需要指定的比较重要的超参数,需要通过实验仔细调整。这里采用随机梯度下降算法: + +.. 
code-block:: python + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + +更多优化算子可以参考 :code:`fluid.optimizer()` 。 + +下一步做什么? +############## + +使用PaddlePaddle实现模型时需要关注 **数据层**、**前向计算逻辑**、**损失函数** 和 **优化方法**。不同的任务需要的数据格式不同,涉及的计算逻辑不同,损失函数不同,优化方法也不同。PaddlePaddle提供了丰富的模型示例,可以以这些示例为参考来构建自己的模型结构。用户可以访问 `模型库 `_ 查看官方提供的示例。 diff --git a/doc/fluid/new_docs/user_guides/howto/debug/index.rst b/doc/fluid/new_docs/user_guides/howto/debug/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..0878e17b4069be6b08bc85a35e77ba6421633218 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/debug/index.rst @@ -0,0 +1,10 @@ +############ +Debug 工具 +############ + +PaddlePaddle 提供了如下方式方便 Debug 训练 情况 + +.. toctree:: + :maxdepth: 2 + + visualdl.md diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md new file mode 100644 index 0000000000000000000000000000000000000000..99f8bee5ca1519ccf5d7c35ad2a64da4a8841ada --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md @@ -0,0 +1,219 @@ +# VisualDL (Visualize the Deep Learning) +

+ +

+ +## 介绍 +VisualDL是一个面向深度学习任务设计的可视化工具,包含了scalar、参数分布、模型结构、图像可视化等功能,项目正处于高速迭代中,新的组件会不断加入。 + +目前大多数DNN平台均使用Python作为配置语言,VisualDL原生支持python的使用, +通过在模型的Python配置中添加几行,便可以为训练过程提供丰富的可视化支持。 + +除了Python SDK之外,VisualDL底层采用C++编写,其暴露的C++ SDK也可以集成到其他平台中, +实现原生的性能和定制效果。 + +## 组件 +VisualDL 目前支持4种组件: + +- graph +- scalar +- image +- histogram + +### Graph +兼容 ONNX(Open Neural Network Exchange)[https://github.com/onnx/onnx], 通过与 python SDK的结合,VisualDL可以兼容包括 PaddlePaddle, pytorch, mxnet在内的大部分主流DNN平台。 + +

+ +

+ +### Scalar +可以用于展示训练测试的误差趋势 + +

+ +
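+
+下面是一个最小的示意代码,分别在 `train` 和 `test` 两个 mode 下创建 scalar 组件来记录训练误差和测试误差(日志目录、tag 名均为示意,误差数值用随机数代替;`LogWriter` 的完整说明见下文 Python SDK 小节):
+
+```python
+import random
+from visualdl import LogWriter
+
+logger = LogWriter("./tmp", sync_cycle=10000)
+
+# 在 train / test 两个 mode 下分别创建 scalar 组件
+with logger.mode("train"):
+    train_cost = logger.scalar("cost")
+with logger.mode("test"):
+    test_cost = logger.scalar("cost")
+
+for step in range(100):
+    # 这里用随机数代替真实的训练 / 测试误差
+    train_cost.add_record(step, random.random())
+    if step % 10 == 0:
+        test_cost.add_record(step, random.random())
+```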

+ +### Image +可以用于可视化任何tensor,或模型生成的图片 + +

+ +

+ +### Histogram + +用于可视化任何tensor中元素分布的变化趋势 + +

+ +
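+
+下面是一个记录参数分布的最小示意代码。注意:其中 `histogram()` 的具体参数(如 `num_buckets`)以及用随机数模拟的参数取值都只是示意用的假设,具体接口请以 VisualDL 的 API 文档为准:
+
+```python
+import numpy as np
+from visualdl import LogWriter
+
+logger = LogWriter("./tmp", sync_cycle=10000)
+
+with logger.mode("train"):
+    # 假设 histogram 组件的创建方式与下文 SDK 小节中的 scalar 组件类似
+    param_hist = logger.histogram("histogram/fc_w", num_buckets=50)
+
+for step in range(100):
+    # 用随机数模拟某一层参数在每个训练步的取值分布
+    param_hist.add_record(step, np.random.normal(0.0, 1.0, size=1000).tolist())
+```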

+ +## 快速尝试 +请使用下面的命令,来快速测试 VisualDL。 + +``` +# 安装,建議是在虚拟环境或anaconda下。 +pip install --upgrade visualdl + +# 运行一个例子,vdl_create_scratch_log 将创建测试日志 +vdl_create_scratch_log +visualDL --logdir=scratch_log --port=8080 + +# 访问 http://127.0.0.1:8080 +``` + +如果以上步骤出现问题,很可能是因为python或pip不同版本或不同位置所致,以下安装方法能解决。 + +## 使用 virtualenv 安装 + +[Virtualenv](https://virtualenv.pypa.io/en/stable/) 能创建独立Python环境,也能确保Python和pip的相对位置正确。 + +在macOS上,安装pip和virtualenv如下: +``` +sudo easy_install pip +pip install --upgrade virtualenv +``` + +在Linux上,安装pip和virtualenv如下: +``` +sudo apt-get install python3-pip python3-dev python-virtualenv +``` + +然后创建一个虚拟环境: +``` +virtualenv ~/vdl # for Python2.7 +virtualenv -p python3 ~/vdl for Python 3.x +``` + +```~/vdl``` 是你的Virtualenv目录, 你也可以选择任一目录。 + +激活虚拟环境如下: +``` +source ~/vdl/bin/activate +``` + +现在再安装 VisualDL 和运行范例: + +``` +pip install --upgrade visualdl + +# 运行一个例子,vdl_create_scratch_log 将创建测试日志 +vdl_create_scratch_log +visualDL --logdir=scratch_log --port=8080 + +# 访问 http://127.0.0.1:8080 +``` +如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上,运行`pip install --upgrade protobuf`就能解决。 + +如果在虚拟环境下仍然遇到安装问题,请尝试以下方法。 + + +## 使用 Anaconda 安装 + +Anaconda是一个用于科学计算的Python发行版,提供了包管理与环境管理的功能,可以很方便地解决多版本python并存、切换以及各种第三方包安装问题。 + +请根据[Anaconda下载网站](https://www.anaconda.com/download) 的指示去下载和安装Anaconda. +下载Python 3.6版本的command-Line installer. + +创建conda环境名字为```vdl```或任何名字: +``` +conda create -n vdl pip python=2.7 # or python=3.3, etc. +``` + +激活conda环境如下: +``` +source activate vdl +``` + +现在再安装 VisualDL 和运行范例: + +``` +pip install --upgrade visualdl + +# 运行一个例子,vdl_create_scratch_log 将创建测试日志 +vdl_create_scratch_log +visualDL --logdir=scratch_log --port=8080 + +# 访问 http://127.0.0.1:8080 +``` + +如果仍然遇到安装问题,请尝试以下用源代码安装方法。 + +### 使用代码安装 +``` +#建議是在虚拟环境或anaconda下。 +git clone https://github.com/PaddlePaddle/VisualDL.git +cd VisualDL + +python setup.py bdist_wheel +pip install --upgrade dist/visualdl-*.whl +``` + +如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md) + + +## SDK +VisualDL 同时提供了python SDK 和 C++ SDK 来实现不同方式的使用。 + +### Python SDK +VisualDL 现在支持 Python 2和 Python 3。 + +以最简单的Scalar组件为例,尝试创建一个scalar组件并插入多个时间步的数据: + +```python +import random +from visualdl import LogWriter + +logdir = "./tmp" +logger = LogWriter(logdir, sync_cycle=10000) + +# mark the components with 'train' label. +with logger.mode("train"): + # create a scalar component called 'scalars/scalar0' + scalar0 = logger.scalar("scalars/scalar0") + +# add some records during DL model running. 
+for step in range(100): + scalar0.add_record(step, random.random()) +``` + +### C++ SDK +上面 Python SDK 中代码完全一致的C++ SDK用法如下 +```c++ +#include +#include +#include "visualdl/sdk.h" + +namespace vs = visualdl; +namespace cp = visualdl::components; + +int main() { + const std::string dir = "./tmp"; + vs::LogWriter logger(dir, 10000); + + logger.SetMode("train"); + auto tablet = logger.AddTablet("scalars/scalar0"); + + cp::Scalar scalar0(tablet); + + for (int step = 0; step < 1000; step++) { + float v = (float)std::rand() / RAND_MAX; + scalar0.AddRecord(step, v); + } + + return 0; +} +``` +## 启动Board +当训练过程中已经产生了日志数据,就可以启动board进行实时预览可视化信息 + +``` +visualDL --logdir +``` + +board 还支持一下参数来实现远程的访问: + +- `--host` 设定IP +- `--port` 设定端口 +- `--model_pb` 指定 ONNX 格式的模型文件 diff --git a/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst b/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..6f6698cadcba4d9645fdc4a8a74d899598b96d99 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst @@ -0,0 +1,10 @@ +############ +模型评估和调试 +############ + +PaddlePaddle Fluid提供了常用的模型评估指标,并提供了VisualDL工具可视化模型效果。 + +.. toctree:: + :maxdepth: 2 + + metrics diff --git a/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst b/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst new file mode 100644 index 0000000000000000000000000000000000000000..f37968a50350a90e698cb1a63bd501635753e7fb --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst @@ -0,0 +1,62 @@ +############ +模型评估 +############ + +模型评估是用指标反映模型在预期目标下精度,根据模型任务决定观察指标,作为在训练中调整超参数,评估模型效果的重要依据。 +metric函数的输入为当前模型的预测preds和labels,输出是自定义的。metric函数和loss函数非常相似,但是metric并不是模型训练网络组成部分。 + +用户可以通过训练网络得到当前的预测preds和labels,在Python端定制metric函数;也可以通过定制c++ Operator的方式,在GPU上加速metric计算。 + +paddle.fluid.metrics模块包含该功能 + + +常用指标 +############ + +metric函数根据模型任务不同,指标构建方法因任务而异。 + +回归类型任务labels是实数,因此loss和metric函数构建相同,可参考MSE的方法。 +分类任务常用指标为分类指标,本文提到的一般是二分类指标,多分类和多标签需要查看对应的API文档。例如排序指标auc,多分类可以作为0,1分类任务,auc指标仍然适用。 +Fluid中包含了常用分类指标,例如Precision, Recall, Accuracy等,更多请阅读API文档。以 :ref:`Precision` 为例,具体方法为 + +.. code-block:: python + + >>> import paddle.fluid as fluid + >>> labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + >>> data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + >>> pred = fluid.layers.fc(input=data, size=1000, act="tanh") + >>> acc = fluid.metrics.Precision() + >>> for pass in range(PASSES): + >>> acc.reset() + >>> for data in train_reader(): + >>> loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + >>> acc.update(preds=preds, labels=labels) + >>> numpy_acc = acc.eval() + + +其他任务例如MultiTask Learning,Metric Learning,Learning To Rank各种指标构造方法请参考API文档。 + +自定义指标 +############ +Fluid支持自定义指标,灵活支持各类计算任务。下文通过一个简单的计数器metric函数,实现对模型的评估。 +其中preds是模型预测值,labels是给定的标签。 + +.. 
code-block:: python + + >>> class MyMetric(MetricBase): + >>> def __init__(self, name=None): + >>> super(MyMetric, self).__init__(name) + >>> self.counter = 0 # simple counter + + >>> def reset(self): + >>> self.counter = 0 + + >>> def update(self, preds, labels): + >>> if not _is_numpy_(preds): + >>> raise ValueError("The 'preds' must be a numpy ndarray.") + >>> if not _is_numpy_(labels): + >>> raise ValueError("The 'labels' must be a numpy ndarray.") + >>> self.counter += sum(preds == labels) + + >>> def eval(self): + >>> return self.counter diff --git a/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..3884284ea020fe94ed9c03ec84c856ee44aa8c3f --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst @@ -0,0 +1,99 @@ +.. _install_or_build_cpp_inference_lib: + +安装与编译C++预测库 +=========================== + +直接下载安装 +------------- + +====================== ======================================== +版本说明 C++预测库 +====================== ======================================== +cpu_avx_mkl `fluid.tgz `_ +cpu_avx_openblas `fluid.tgz `_ +cpu_noavx_openblas `fluid.tgz `_ +cuda7.5_cudnn5_avx_mkl `fluid.tgz `_ +cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ +cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ +====================== ======================================== + +从源码编译 +---------- +用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项: + +================= ========= +选项 值 +================= ========= +CMAKE_BUILD_TYPE Release +FLUID_INSTALL_DIR 安装路径 +WITH_FLUID_ONLY ON(推荐) +WITH_SWIG_PY OFF(推荐 +WITH_PYTHON OFF(推荐) +WITH_GPU ON/OFF +WITH_MKL ON/OFF +================= ========= + +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + + .. code-block:: bash + + pip install paddlepaddle-gpu + PADDLE_ROOT=/path/of/capi + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + mkdir build + cd build + cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_FLUID_ONLY=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. + make + make inference_lib_dist + +成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息) +均会存放于PADDLE_ROOT目录中。目录结构如下: + + .. code-block:: text + + PaddleRoot/ + ├── CMakeCache.txt + ├── paddle + │   └── fluid + │   ├── framework + │   ├── inference + │   ├── memory + │   ├── platform + │   ├── pybind + │   └── string + ├── third_party + │   ├── boost + │   │   └── boost + │   ├── eigen3 + │   │   ├── Eigen + │   │   └── unsupported + │   └── install + │   ├── gflags + │   ├── glog + │   ├── mklml + │   ├── protobuf + │   ├── snappy + │   ├── snappystream + │   └── zlib + └── version.txt + +version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如: + + .. 
code-block:: text + + GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8 + WITH_MKL: ON + WITH_GPU: ON + CUDA version: 8.0 + CUDNN version: v5 diff --git a/doc/fluid/new_docs/user_guides/howto/inference/index.rst b/doc/fluid/new_docs/user_guides/howto/inference/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..45e1a2883773b92ed47ef8d51417bbdcd060b4ec --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/inference/index.rst @@ -0,0 +1,11 @@ +############ +模型预测部署 +############ + +PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线 + +.. toctree:: + :maxdepth: 2 + + build_and_install_lib_cn.rst + native_infer.rst diff --git a/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst new file mode 100644 index 0000000000000000000000000000000000000000..6d6f3035c0b5c985cd39d45df9f1bcce50dcefa0 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst @@ -0,0 +1,106 @@ +Paddle 预测 API +=============== + +为了更简单方便的预测部署,Fluid 提供了一套高层 API +用来隐藏底层不同的优化实现。 + +`预测库相关代码 `_ +包括 + +- 头文件 ``paddle_inference_api.h`` 定义了所有的接口 +- 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a`` + + +编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。 + +下面是一些 API 概念的介绍 + +PaddleTensor +------------ + +PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是 + +.. code:: cpp + + struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; + }; + +- ``name`` 用于指定输入数据对应的 模型中variable 的名字 + (暂时没有用,但会在后续支持任意 target 时启用) +- ``shape`` 表示一个 Tensor 的 shape +- ``data`` 数据以连续内存的方式存储在\ ``PaddleBuf`` + 中,\ ``PaddleBuf`` + 可以接收外面的数据或者独立\ ``malloc``\ 内存,详细可以参考头文件中相关定义。 +- ``dtype`` 表示 Tensor 的数据类型 + +engine +------ + +高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine + +- 原生 engine,由 paddle 原生的 forward operator + 组成,可以天然支持所有paddle 训练出的模型, +- Anakin engine,封装了 + `Anakin `__ + ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle + 模型, +- TensorRT mixed engine,用子图的方式支持了 + `TensorRT `__ ,支持所有paddle + 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) + +其实现为 + +.. code:: cpp + + enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops. + }; + +预测部署过程 +------------ + +总体上分为以下步骤 + +1. 用合适的配置创建 ``PaddlePredictor`` +2. 创建输入用的 ``PaddleTensor``\ ,传入到 ``PaddlePredictor`` 中 +3. 获取输出的 ``PaddleTensor`` ,将结果取出 + +下面完整演示一个简单的模型,部分细节代码隐去 + +.. code:: cpp + + #include "paddle_inference_api.h" + + // 创建一个 config,并修改相关设置 + paddle::NativeConfig config; + config.model_dir = "xxx"; + config.use_gpu = false; + // 创建一个原生的 PaddlePredictor + auto predictor = + paddle::CreatePaddlePredictor(config); + // 创建输入 tensor + int64_t data[4] = {1, 2, 3, 4}; + paddle::PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; + // 创建输出 tensor,输出 tensor 的内存可以复用 + std::vector outputs; + // 执行预测 + CHECK(predictor->Run(slots, &outputs)); + // 获取 outputs ... 
+ +编译时,联编 ``libpaddle_fluid.a/.so`` 便可。 + +详细代码参考 +------------ + +- `inference + demos `__ +- `复杂单线程/多线程例子 `__ diff --git a/doc/fluid/new_docs/user_guides/howto/modification/foo.rst b/doc/fluid/new_docs/user_guides/howto/modification/foo.rst new file mode 100644 index 0000000000000000000000000000000000000000..9d43c91a8544c3b281b2e8d556cb8b8e069d7e0a --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/modification/foo.rst @@ -0,0 +1,3 @@ +### +FAQ +### diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst new file mode 100644 index 0000000000000000000000000000000000000000..c3bf033bb8316eeb4901c0cdc61e0556c8816dac --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst @@ -0,0 +1,169 @@ +.. _user_guide_use_numpy_array_as_train_data: + +########################### +使用Numpy Array作为训练数据 +########################### + +PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层; +再使用 Numpy Array 或者直接使用Python创建C++的 +:code:`fluid.LoDTensor` , 通过 :code:`Executor.run(feed=...)` 传给 +:code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` 。 + +数据层配置 +########## + +通过 :code:`fluid.layers.data()` 可以配置神经网络中需要的数据层。具体方法为: + +.. code-block:: python + + import paddle.fluid as fluid + + image = fluid.layers.data(name="image", shape=[3, 224, 224]) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + # use image/label as layer input + prediction = fluid.layers.fc(input=image, size=1000, act="softmax") + loss = fluid.layers.cross_entropy(input=prediction, label=label) + ... + +上段代码中,:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data` +创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据; +:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是: + +1. Fluid中默认使用 :code:`-1` 表示 batch size 维度,默认情况下会在 :code:`shape` + 的第一个维度添加 :code:`-1` 。 所以 上段代码中, 我们可以接受将一个 + :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size + 维度的位置的话,请设置 :code:`fluid.layers.data(append_batch_size=False)` 。 + 请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。 + + +2. Fluid中用来做类别标签的数据类型是 :code:`int64`,并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。 + +.. _user_guide_feed_data_to_executor: + +传递训练数据给执行器 +#################### + +:code:`Executor.run` 和 :code:`ParallelExecutor.run` 都接受一个 :code:`feed` 参数。 +这个参数是一个Python的字典。它的键是数据层的名字,例如上文代码中的 :code:`image`。 +它的值是对应的numpy array。 + +例如: + +.. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(feed={ + "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'), + "label": numpy.random.random(size=(32, 1)).astype('int64') + }) + +进阶使用 +######## + +如何传入序列数据 +---------------- + +序列数据是PaddlePaddle Fluid支持的特殊数据类型,可以使用 :code:`LoDTensor` 作为 +输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据; +2.每个序列的长度信息。 +用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。 + +传入序列信息的时候,需要设置序列嵌套深度,:code:`lod_level`。 +例如训练数据是词汇组成的句子,:code:`lod_level=1`;训练数据是 词汇先组成了句子, +句子再组成了段落,那么 :code:`lod_level=2`。 + +例如: + +.. code-block:: python + + sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1) + + ... 
+ + exe.run(feed={ + "sentence": create_lod_tensor( + data=numpy.array([1, 3, 4, 5, 3, 6, 8], dtype='int64').reshape(-1, 1), + lod=[4, 1, 2], + place=fluid.CPUPlace() + ) + }) + +训练数据 :code:`sentence` 包含三个样本,他们的长度分别是 :code:`4, 1, 2`。 +他们分别是 :code:`data[0:4]`, :code:`data[4:5]` 和 :code:`data[5:7]`。 + +如何分别设置ParallelExecutor中每个设备的训练数据 +------------------------------------------------ + +用户将数据传递给使用 :code:`ParallelExecutor.run(feed=...)` 时, +可以显示指定每一个训练设备(例如GPU)上的数据。 +用户需要将一个列表传递给 :code:`feed` 参数,列表中的每一个元素都是一个字典。 +这个字典的键是数据层的名字,值是数据层的值。 + +例如: + +.. code-block:: python + + parallel_executor = fluid.ParallelExecutor() + parallel_executor.run( + feed=[ + { + "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'), + "label": numpy.random.random(size=(32, 1)).astype('int64') + }, + { + "image": numpy.random.random(size=(16, 3, 224, 224)).astype('float32'), + "label": numpy.random.random(size=(16, 1)).astype('int64') + }, + ] + ) + +上述代码中,GPU0会训练 32 个样本,而 GPU1训练 16 个样本。 + + +.. _user_guide_customize_batch_size_rank: + +自定义BatchSize维度 +------------------- + +PaddlePaddle Fluid默认batch size是数据的第一维度,以 :code:`-1` 表示。但是在高级 +使用中,batch_size 可以固定,也可以是其他维度或者多个维度来表示。这都需要设置 +:code:`fluid.layers.data(append_batch_size=False)` 来完成。 + +1. 固定batch size维度 + + .. code-block:: python + + image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False) + + 这里,:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。 + +2. 使用其他维度表示batch size + + .. code-block:: python + + sentence = fluid.layers.data(name="sentence", + shape=[80, -1, 1], + append_batch_size=False, + dtype="int64") + + 这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经 + 网络中。 + + +.. _user_guide_paddle_support_data_types: + +Fluid目前支持的数据类型 +----------------------- + +PaddlePaddle Fluid目前支持的数据类型包括: + + * float16: 部分操作支持 + * float32: 主要实数类型 + * float64: 次要实数类型,支持大部分操作 + * int32: 次要标签类型 + * int64: 主要标签类型 + * uint64: 次要标签类型 + * bool: 控制流数据类型 + * int16: 次要标签类型 + * uint8: 输入数据类型,可用于图像像素 \ No newline at end of file diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..cca3684b78518867eae95d82e1347b52427ddc81 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst @@ -0,0 +1,51 @@ +.. _user_guide_prepare_data: + +######## +准备数据 +######## + +PaddlePaddle Fluid支持两种传入数据的方式: + +1. 用户需要使用 :code:`fluid.layers.data` +配置数据输入层,并在 :code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` +中,使用 :code:`executor.run(feed=...)` 传入训练数据。 + +2. 用户需要先将训练数据 +转换成 Paddle 识别的 :code:`fluid.recordio_writer` , 再使用 +:code:`fluid.layers.open_files` 以及 :code:`fluid.layers.reader` 配置数据读取。 + +这两种准备数据方法的比较如下: + +.. 
_user_guide_prepare_data_comparision: + ++------------+----------------------------------+---------------------------------------+ +| | Feed数据 | 使用Reader | ++============+==================================+=======================================+ +| API接口 | :code:`executor.run(feed=...)` | :code:`fluid.layers.reader` | ++------------+----------------------------------+---------------------------------------+ +| 数据格式 | Numpy Array | :code:`fluid.recordio_writer` | ++------------+----------------------------------+---------------------------------------+ +| 数据增强 | Python端使用其他库完成 | 使用Fluid中的Operator 完成 | ++------------+----------------------------------+---------------------------------------+ +| 速度 | 慢 | 快 | ++------------+----------------------------------+---------------------------------------+ +| 推荐用途 | 调试模型 | 工业训练 | ++------------+----------------------------------+---------------------------------------+ + +这些准备数据的详细使用方法,请参考: + +.. toctree:: + :maxdepth: 2 + + feeding_data + +Python Reader +############# + +为了方便用户在Python中定义数据处理流程,PaddlePaddle Fluid支持 Python Reader, +具体请参考: + +.. toctree:: + :maxdepth: 2 + + reader.md diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md b/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md new file mode 100644 index 0000000000000000000000000000000000000000..aa50e4d26166536eaf8044d527debd8ad46060f6 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md @@ -0,0 +1,210 @@ +```eval_rst +.. _user_guide_reader: +``` + +# Python Reader + +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: + +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. + +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. + +## Data Reader Interface + +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: + +``` +iterable = data_reader() +``` + +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) 
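+
+For instance, a minimal (purely illustrative) reader is just a parameterless generator function; real readers usually yield numpy arrays, as in the reader creators shown next:
+
+```python
+def simple_reader():
+    # a reader is any callable taking no parameters whose call produces an iterable;
+    # each yielded value is a single data entry, never a mini batch
+    for value in [1, 2, 3, 4]:
+        yield value
+
+iterable = simple_reader()
+```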
+ +An example implementation for single item data reader creator is as follows: + +```python +def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader +``` + +An example implementation for multiple item data reader creator is as follows: +```python +def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader +``` + +## Batch Reader Interface + +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: + +```python +# a mini batch of three data items. Each data item consist three columns of data, each of which is 1. +[(1, 1, 1), +(2, 2, 2), +(3, 3, 3)] + +# a mini batch of three data items, each data item is a list (single column). +[([1,1,1],), +([2,2,2],), +([3,3,3],)] +``` + +Please note that each item inside the list must be a tuple, below is an invalid output: +```python + # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. +[[1,1,1], +[2,2,2], +[3,3,3]] +``` + +It is easy to convert from a reader to a batch reader: + +```python +mnist_train = paddle.dataset.mnist.train() +mnist_train_batch_reader = paddle.batch(mnist_train, 128) +``` + +It is also straight forward to create a custom batch reader: + +```python +def custom_batch_reader(): + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. + yield batch + +mnist_random_image_batch_reader = custom_batch_reader +``` + +## Usage + +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: + +```python +# two data layer is created: +image_layer = paddle.layer.data("image", ...) +label_layer = paddle.layer.data("label", ...) + +# ... +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) +``` + +## Data Reader Decorator + +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. + +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: + +### Prefetch Data + +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. + +Use `paddle.reader.buffered` to prefetch data: + +```python +buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) +``` + +`buffered_reader` will try to buffer (prefetch) `100` data entries. + +### Compose Multiple Data Readers + +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). 
+ +We can do the following : + +```python +def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader + +def reader_creator_bool(t): + def reader: + while True: + yield t + return reader + +true_reader = reader_creator_bool(True) +false_reader = reader_creator_bool(False) + +reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) +# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. +# And we don't care about the second item at this time. +paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) +``` + +### Shuffle + +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. + +Example: +```python +reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) +``` + +## Q & A + +### Why does a reader return only a single entry, and not a mini batch? + +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). + +We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. + +### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? + +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. + +### Why use a dictionary instead of a list to provide mapping? + +Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). + +### How to create a custom data reader creator ? + +```python +def image_reader_creator(image_path, label_path, n): + def reader(): + f = open(image_path) + l = open(label_path) + images = numpy.fromfile( + f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") + for i in xrange(n): + yield images[i, :], labels[i] # a single entry of data is created each time + f.close() + l.close() + return reader + +# images_reader_creator creates a reader +reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) +paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) +``` + +### How is `paddle.train` implemented + +An example implementation of paddle.train is: + +```python +def train(batch_reader, mapping, batch_size, total_pass): + for pass_idx in range(total_pass): + for mini_batch in batch_reader(): # this loop will never end in online learning. 
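+            # `mapping` (e.g. {"image": 0, "label": 1}) tells which column of each entry feeds which data layer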
+ do_forward_backward(mini_batch, mapping) +``` diff --git a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c4afd536c67b24a17e4437ecedf779ddcddcbc98 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md @@ -0,0 +1,60 @@ +# Checkpoint功能使用指南 + +## 背景 +单机/多机在训练过程中会由于软件/硬件的问题出现异常,导致训练中断,进而导致训练无结果或结果不可用,浪费大量时间和机器性能。 + +## 目的 +Checkpoint功能能够在训练中途对训练数据中间数据进行保存,出现异常恢复训练的时候能够加载中途保存的数据继续训练, 实现单机/多机的容错训练的功能。 + +## 说明 +### 目前已实现的参数保存: +1. 基于Trainer 0 实现训练过程中的参数保存 +2. 基于PServer 实现了```Distribute Lookup Table```相关参数保存 +### Fluid Checkpoint 保存数据目录结构: + +``` +checkpoint_dir (用户定义的checkpoint目录) +├── checkpoint_0 (第一次保存) +│ ├── __lockup_table__ (Distribute Lookup Table 目录) +│ │ ├── table_pserver_0 (Pserver 0 号保存的lookup table 数据) +│ │ └── table_pserver_1 +│ ├── __model__ (model 目录) +│ │ └── var.w_1 +│ └── trainer_0 (trainer 自有数据保存) +│ ├── epoch_id +│ └── step_id +└── checkpoint_1 (第二次保存) +``` + +## 使用方法 +### 声明Fluid.CheckpointConfig +用户对checkpoint功能的配置,主要是配置对象```Fluid```中的```CheckpointConfig```. + +```CheckpointConfig``` 包括4个参数: + +| 参数 | 类型 | 说明 | +| - | :-: | - | +| checkpoint_dir | int| checkpoint存储目录 | +| max_num_checkpoints | int | 最大保存的checkpoint副本数 | +| epoch_interval | int | 每隔epoch_interval轮epoch | +| step_interval | int | 每隔step_interval轮step | + +### 在Fluid.Trainer对象的声明中加入Fluid.CheckpointConfig的声明 +Trainer的__init__方法的参数中包含了对```CheckpointConfig```, 需要传入在声明Trainer前声明的```CheckpointConfig```对象。 +如: +```python +config = CheckpointConfig( + checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, + epoch_interval = 2, step_interval = 10) +trainer = Trainer(..., checkpoint_config=config) +``` +定义和声明完成后, 训练在运行过程中就会在指定的step和epoch处进行保存,出现异常时,就会自动从最新的checkpoint目录进行参数恢复啦! + +## 相关API +[Trainer API 说明](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py) + +## 注意 +1. 保证每个训练的```checkpoint_dir``` 与其他训练独立。 +2. 最大副本数量```max_num_checkpoints```需要根据磁盘容量以及模型的大小进行调整, 保证磁盘的可用性。 +3. ```epoch_interval``` 和 ```step_interval``` 不宜过小, 频繁的进行checkpoint会拖慢训练速度。 +4. **分布式训练**的过程中:每个Trainer都会在```checkpoint_dir```目录中保存当前Trainer的参数(只有Trainer 0会保存模型的参数),需要**分布式文件系统(HDFS等)**将同```checkpoint_dir```目录的数据进行合并才能得到完整的数据,恢复训练的时候需要用完整的数据进行恢复。 diff --git a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md new file mode 100644 index 0000000000000000000000000000000000000000..14d37246ca0cab8715e244fda9624d0d59f8ec5f --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md @@ -0,0 +1,62 @@ +# Checkpoint User Guide + +## Background +In many cases, Stand-alone training and Distributed training can be aborted by the software problem or hardware problem. More seriously, we waste so much time and the performance of the machine but get nothing, which makes us frustrating and we have to restart it again. + +## Purpose +The feature of ```Checkpoint``` can save Intermediate model variables, lookup table variable, and other needs data in checkpoint directory. When the exception occurs, we can load these variables from the checkpoint directory immediately. +## Introduce +### Complete Features Currently: +1. The Trainer 0 will save model variables in training. +2. Each of the Trainer will save its own arguments needed. +3. Each of the Parameter Server will save ```Distribute Lookup Table``` variables in training. 
+### Fluid Checkpoint directory structure: + +``` +checkpoint_dir (the checkpoint directory user define) +├── checkpoint_0 (the first save directory) +│ ├── __lockup_table__ (Distribute Lookup Table directory) +│ │ ├── table_pserver_0 (Lookup table's data about Pserver 0) +│ │ └── table_pserver_1 +│ ├── __model__ (model directory) +│ │ └── var.w_1 +│ └── trainer_0 (each trainer will save its own data) +│ ├── epoch_id +│ └── step_id +└── checkpoint_1 (the second save directory) +``` + +## usage +### Fluid.CheckpointConfig construct +When the user wants to use ```Checkpoint``` feature, the main thing user have to do is declare ```CheckpointConfig``` and construct it. + +```CheckpointConfig``` has 4 member variables need to be initialized: + +| Member Variable | Type | Comment | +| - | :-: | - | +| checkpoint_dir | int| checkpoint directory | +| max_num_checkpoints | int | Maximum number of checkpoint copies | +| epoch_interval | int | epoch interval times | +| step_interval | int | step interval times | + +### Add Fluid.CheckpointConfig's declaration in Fluid.Trainer +Because the initialization of Trainer needs an instance of ```CheckpointConfig```., we should declare ```CheckpointConfig``` in ```Fluid``` first. + +For example: +```python +config = CheckpointConfig( + checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, + epoch_interval = 2, step_interval = 10) +trainer = Trainer(..., checkpoint_config=config) +``` + +After all the things done, the train will save checkpoint at the specified epoch and step, when the train is aborted, the user can restart it, the train will restore from the latest copy. + +## Related API +[Related Trainer API](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py) + +## Attention +1. Make the ```checkpoint_dir``` only be used by one train job. +2. The number of ```max_num_checkpoints``` need to be adjusted by the disk size and model size. +3. Too frequently to slow down the train speed, so too ```small epoch_interval``` and ```step_interval``` are not suitable. +4. **In distributed train**, each Trainer will save arguments in its ```checkpoint_dir``` (Only Trainer 0 will save model variables). We need **distributed file system (HDFS, etc)** to merge all the ```checkpoint_dir``` to get the whole data. diff --git a/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst b/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst new file mode 100644 index 0000000000000000000000000000000000000000..00ec9e819c81fae3263b1f1e6bcedf524f2b3991 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst @@ -0,0 +1,160 @@ +.. _cluster_howto + +Fluid分布式训练使用手册 +==================== + +分布式训练基本思想 +--------------- + +分布式深度学习训练通常分为两种并行化方法:数据并行,模型并行,参考下图: + +.. image:: src/parallelism.png + +在模型并行方式下,模型的层和参数将被分布在多个节点上,模型在一个mini-batch的前向和反向训练中,将经过多次跨\ +节点之间的通信。每个节点只保存整个模型的一部分;在数据并行方式下,每个节点保存有完整的模型的层和参数,每个节点\ +独自完成前向和反向计算,然后完成梯度的聚合并同步的更新所有节点上的参数。Fluid目前版本仅提供数据并行方式,另外\ +诸如模型并行的特例实现(超大稀疏模型训练)功能将在后续的文档中予以说明。 + +在数据并行模式的训练中,Fluid使用了两种通信模式,用于应对不同训练任务对分布式训练的要求,分别为RPC通信和Collective +通信。其中RPC通信方式使用 `gRPC `_ ,Collective通信方式使用 +`NCCL2 `_ 。 + +.. csv-table:: 下面是一个RPC通信和Collective通信的横向对比: + :header: "Feature", "Coolective", "RPC" + + "Ring-Based通信", "Yes", "No" + "异步训练", "Yes", "Yes" + "分布式模型", "No", "Yes" + "容错训练", "No", "Yes" + "性能", "Faster", "Fast" + +- RPC通信方式的结构: + + .. 
image:: src/dist_train_pserver.png + + 使用RPC通信方式的数据并行分布式训练,会启动多个pserver进程和多个trainer进程,每个pserver进程\ + 会保存一部分模型参数,并负责接收从trainer发送的梯度并更新这些模型参数;每个trainer进程会保存一份\ + 完整的模型,并使用一部分数据进行训练,然后向pserver发送梯度,最后从pserver拉取更新后的参数。 + + pserver进程可以在和trainer完全不同的计算节点上,也可以和trainer公用节点。一个分布式任务所需要的\ + pserver进程个数通常需要根据实际情况调整,已达到最佳的性能,然而通常来说pserver的进程不会比trainer\ + 更多。 + + 在使用GPU训练时,pserver可以选择使用GPU或只使用CPU,如果pserver也使用GPU,则会增加一次从CPU拷贝\ + 接收到的梯度数据到GPU的开销,在某些情况下会导致整体训练性能降低。 + +- NCCL2通信方式的结构: + + .. image:: src/dist_train_nccl2.png + + 使用NCCL2(Collective通信方式)进行分布式训练,是不需要启动pserver进程的,每个trainer进程都保存\ + 一份完整的模型参数,在完成计算梯度之后通过trainer之间的相互通信,Reduce梯度数据到所有节点的所有设备\ + 然后每个节点在各自完成参数更新。 + +使用parameter server方式的训练 +------------------------------ + +使用 :code:`trainer` API,程序可以自动的通过识别环境变量决定是否已分布式方式执行。 + +.. csv-table:: 需要在您的分布式环境中配置的环境变量包括: + :header: "环境变量", "说明" + + "PADDLE_TRAINING_ROLE", "当前进程的角色,可以是PSERVER或TRAINER" + "PADDLE_PSERVER_PORT", "parameter使用的端口" + "PADDLE_PSERVER_IPS", "parameter server的IP地址列表,用逗号分开" + "PADDLE_TRAINERS", "分布式任务中trainer节点的个数" + "PADDLE_CURRENT_IP", "当前节点的IP" + "PADDLE_TRAINER_ID", "trainer节点的id,从0~n-1,不能有重复" + +使用更加底层的 :code:`transpiler` API可以提供自定义的分布式训练的方法,比如可以在同一台机器上, +启动多个pserver和trainer进行训练,使用底层API的方法可以参考下面的样例代码: + +.. code-block:: python + + role = "PSERVER" + trainer_id = 0 + pserver_endpoints = "127.0.0.1:6170,127.0.0.1:6171" + current_endpoint = "127.0.0.1:6170" + trainers = 4 + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif role == "TRAINER": + train_loop(t.get_trainer_program()) + + +选择同步或异步训练 +++++++++++++++++++ + +Fluid分布式任务可以支持同步训练或异步训练,在同步训练方式下,所有的trainer节点,会在每个mini-batch +同步地合并所有节点的梯度数据并发送给parameter server完成更新,在异步训练方式下,每个trainer没有相互\ +同步等待的过程,可以独立的parameter server的参数。通常情况下,使用异步训练方式,可以在trainer节点\ +更多的时候比同步训练方式有更高的总体吞吐量。 + +在调用 :code:`transpile` 函数时,默认会生成同步训练的分布式程序,通过指定 :code:`sync_mode=False` +参数即可生成异步训练的程序: + +.. code-block:: python + + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, sync_mode=False) + + +选择参数分布方法 +++++++++++++++++ + +参数 :code:`split_method` 可以指定参数在parameter server上的分布方式。 + +Fluid默认使用 `RoundRobin `_ +方式将参数分布在多个parameter server上。此方式在默认未关闭参数切分的情况下,参数会较平均的分布在所有的 +parameter server上。如果需要使用其他,可以传入其他的方法,目前可选的方法有: :code:`RoundRobin` 和 +:code:`HashName` 。也可以使用自定义的分布方式,只需要参考 +`这里 `_ +编写自定义的分布函数。 + + +关闭切分参数 +++++++++++++ + +参数 :code:`slice_var_up` 指定是否将较大(大于8192个元素)的参数切分到多个parameter server已均衡计算负载,默认为开启。 + +当模型中的可训练参数体积比较均匀或者使用自定义的参数分布方法是参数均匀分布在多个parameter server上, +可以选择关闭切分参数,这样可以降低切分和重组带来的计算和拷贝开销: + +.. code-block:: python + + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, slice_var_up=False) + + +使用NCCL2通信方式的训练 +-------------------- + +注NCCL2模式目前仅支持trainer API,NCCL2方式并没有很多可选项,也没有"transpiler",所以并没有底层API。 +使用NCCL2方式同样需要配置每个节点的环境变量,此处与parameter server模式有所不同,并不需要启动独立的\ +parameter server的进程,只需要启动多个trainer进程即可。 + + +.. csv-table:: NCCL2模式环境变量说明: + :header: "环境变量", "说明" + + "PADDLE_TRAINER_IPS", "所有Trainer节点的IP列表,用逗号分隔" + "PADDLE_TRAINER_ID", "trainer节点的id,从0~n-1,不能有重复" + "PADDLE_PSERVER_PORT", "一个端口,用于在NCCL2初始化时,广播NCCL ID" + "PADDLE_CURRENT_IP", "当前节点的IP" + +目前使用NCCL2进行分布式训练仅支持同步训练方式。使用NCCL2方式的分布式训练,更适合模型体积较大,并需要使用\ +同步训练和GPU训练,如果硬件设备支持RDMA和GPU Direct,可以达到很高的分布式训练性能。 + +注意如果系统中有多个网络设备,需要手动指定NCCL2使用的设备, +假设需要使用 :code:`eth2` 为通信设备,需要设定如下环境变量: + +.. 
code-block:: bash + + export NCCL_SOCKET_IFNAME=eth2 + +另外NCCL2提供了其他的开关环境变量,比如指定是否开启GPU Direct,是否使用RDMA等,详情可以参考 +`ncclknobs `_ 。 diff --git a/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst b/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst new file mode 100644 index 0000000000000000000000000000000000000000..6131c92d6f5386c7e91b2917d25dd7ae830ff182 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst @@ -0,0 +1,143 @@ +.. _cluster_quick_start: + +分布式训练快速开始 +================== + +准备工作 +-------- + +在本篇文章中,我们将会在介绍如何快速在一个集群中启动一个 PaddlePaddle +的分布式训练任务,在开始之前,请按如下步骤做些准备工作: + +1. 准备一个至少4个节点的集群,并且保证网络可以联通,在本文中我们使用 + ``*.paddlepaddle.com`` 来表示每个节点的主机名称,您可以根据集群的实际情况来修改它。 + +2. 在开始之前确保已经阅读过 :ref:`how_to_install` + 并且可以在集群的所有节点上可以正常运行 PaddlePaddle。 + +启动集群训练任务 +---------------- + +在启动集群训练脚本时,需要在不同的节点上指定不同的环境变量,具体如下: + ++-----------------+-----------------+-----------------+---------------------+ +| 环境变量 | 数据类型 | 样例 | 描述 | ++=================+=================+=================+=====================+ +| PADDLE_TRAINING | str | PSERVER,TRAINER | 训练节点的角色 | +| _ROLE | | | | ++-----------------+-----------------+-----------------+---------------------+ +| PADDLE_PSERVER_ | str | ps0.paddlepaddl | 所有 pserver | +| IPS | | e.com,ps1.paddl | 节点的 IP | +| | | epaddle.com… | 地址或 | +| | | | hostname, | +| | | | 用“,”分隔 | ++-----------------+-----------------+-----------------+---------------------+ +| PADDLE_PSERVER_ | int | 6174 | pserver | +| PORT | | | 节点监听的端口 | ++-----------------+-----------------+-----------------+---------------------+ +| PADDLE_TRAINERS | int | 2 | 训练任务中 | +| | | | trainer | +| | | | 节点的数量 | ++-----------------+-----------------+-----------------+---------------------+ +| PADDLE_CURRENT_ | str | ps0.paddlepaddl | 当前 pserver | +| IP | | e.com | 节点的 IP | +| | | | 地址或 hostanme | ++-----------------+-----------------+-----------------+---------------------+ +| PADDLE_TRAINER_ | int | 0 | 当前 trainer | +| ID | | | 节点的唯一 ID, | +| | | | 取值范围为从0开始到 | +| | | | PADDLE_TRAINERS-1 | ++-----------------+-----------------+-----------------+---------------------+ + +样例代码 +~~~~~~~~ + +将下面程序代码保存为 ``fluid_dist.py`` + +.. 
code:: python + + import paddle + import paddle.fluid as fluid + import contextlib + import numpy + import unittest + + # train reader + BATCH_SIZE = 20 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.test(), buf_size=500), + batch_size=BATCH_SIZE) + + + def train_program(): + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + + loss = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(loss) + + return avg_loss + + def optimizer_func(): + return fluid.optimizer.SGD(learning_rate=0.001) + + def train(use_cuda, train_program): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + trainer = fluid.Trainer( + train_func=train_program, place=place, optimizer_func=optimizer_func) + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step == 10: + test_metrics = trainer.test( + reader=test_reader, feed_order=['x', 'y']) + print("step {0}, loss: {1}".format(event.step, test_metrics)) + trainer.stop() + + trainer.train( + reader=train_reader, + num_epochs=100, + event_handler=event_handler, + feed_order=['x', 'y']) + + train(False, train_program) + +启动trainer节点和pserver节点 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + + * - 启动节点 + - 启动命令 + - 说明 + * - ps0.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps0.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动 pserver 节点 + * - ps1.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps1.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动 pserver 节点 + * - trainer0.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=0 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动第0号 trainer 节点 + * - trainer1.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=1 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动第1号 trainer 节点 + +**注意** + +- 需要先启动pserver节点再启动trainer节点 +- 看到trainer节点输出如下日志表示训练任务执行正确 + + .. code:: bash + + step 10, loss: [258.2326202392578] diff --git a/doc/fluid/new_docs/user_guides/howto/training/index.rst b/doc/fluid/new_docs/user_guides/howto/training/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..68475101e26b3f695c8003995cc1c6a95426ff27 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/index.rst @@ -0,0 +1,12 @@ +############ +训练神经网络 +############ + +PaddlePaddle Fluid支持单机训练,和多节点训练。每种训练模式下,都支持多种训练方法。 + +.. toctree:: + :maxdepth: 2 + + single_node + multi_node + save_load_variables diff --git a/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst b/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst new file mode 100644 index 0000000000000000000000000000000000000000..24316f0be0d8f211e680fa15cb432732b5967c79 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst @@ -0,0 +1,9 @@ +######## +多机训练 +######## + +.. 
toctree:: + :maxdepth: 2 + + cluster_quick_start.rst + cluster_howto.rst diff --git a/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst b/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst new file mode 100644 index 0000000000000000000000000000000000000000..a96776f4a17a1d6da170bdff9d81771c38912bb5 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst @@ -0,0 +1,172 @@ +.. _user_guide_save_load_vars: + +################## +保存与载入模型变量 +################## + +模型变量分类 +############ + +在PaddlePaddle Fluid中,所有的模型变量都用 :code:`fluid.Variable()` 作为基类进行表示。 +在该基类之下,模型变量主要可以分为以下几种类别: + +1. 模型参数 + 模型参数是深度学习模型中被训练和学习的变量,在训练过程中,训练框架根据反向传播算法计算出每一个模型参数当前的梯度, + 并用优化器根据梯度对参数进行更新。模型的训练过程本质上可以看做是模型参数不断迭代更新的过程。 + 在PaddlePaddle Fluid中,模型参数用 :code:`fluid.framework.Parameter` 来表示, + 这是一个 :code:`fluid.Variable()` 的派生类,除了 :code:`fluid.Variable()` 具有的各项性质以外, + :code:`fluid.framework.Parameter` 还可以配置自身的初始化方法、更新率等属性。 + +2. 长期变量 + 长期变量指的是在整个训练过程中持续存在、不会因为一个迭代的结束而被销毁的变量,例如动态调节的全局学习率等。 + 在PaddlePaddle Fluid中,长期变量通过将 :code:`fluid.Variable()` 的 :code:`persistable` + 属性设置为 :code:`True` 来表示。所有的模型参数都是长期变量,但并非所有的长期变量都是模型参数。 + +3. 临时变量 + 不属于上面两个类别的所有模型变量都是临时变量,这种类型的变量只在一个训练迭代中存在,在每一个迭代结束后, + 所有的临时变量都会被销毁,然后在下一个迭代开始之前,又会先构造出新的临时变量供本轮迭代使用。 + 一般情况下模型中的大部分变量都属于这一类别,例如输入的训练数据、一个普通的layer的输出等等。 + + + +如何保存模型变量 +################ + +根据用途的不同,我们需要保存的模型变量也是不同的。例如,如果我们只是想保存模型用来进行以后的预测, +那么只保存模型参数就够用了。但如果我们需要保存一个checkpoint以备将来恢复训练, +那么我们应该将各种长期变量都保存下来,甚至还需要记录一下当前的epoch和step的id。 +因为一些模型变量虽然不是参数,但对于模型的训练依然必不可少。 + +因此,根据需求的不同,我们提供了两套API来分别进行模型的参数和checkpoint的保存。 + +保存模型用于对新样本的预测 +========================== + +如果我们保存模型的目的是用于对新样本的预测,那么只保存模型参数就足够了。我们可以使用 +:code:`fluid.io.save_params()` 接口来进行模型参数的保存。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_params(executor=exe, dirname=param_path, main_program=None) + +上面的例子中,通过调用 :code:`fluid.io.save_params` 函数,PaddlePaddle Fluid会对默认 +:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描, +筛选出其中所有的模型参数,并将这些模型参数保存到指定的 :code:`param_path` 之中。 + + +保存checkpoint用于将来恢复训练 +============================== + +在训练过程中,我们可能希望在一些节点上将当前的训练状态保存下来, +以便在将来需要的时候恢复训练环境继续进行训练。这一般被称作“checkpoint”。 +想要保存checkpoint,可以使用 :code:`fluid.io.save_checkpiont()` 接口。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + fluid.io.save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3) + +上面的例子中,通过调用 :code:`fluid.io.save_checkpoint` 函数,PaddlePaddle Fluid会对默认 +:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描, +根据一系列内置的规则自动筛选出其中所有需要保存的变量,并将他们保存到指定的 :code:`path` 目录下。 + +:code:`fluid.io.save_checkpoint` 的各个参数中, :code:`trainer_id` 在单机情况下设置为0即可; :code:`trainer_args` +为一个Python dict,用于给定当前的epoch_id和step_id; +:code:`max_num_checkpoints` 用于表示的最大checkpoint数量, +如果目录中已经存在的checkpoint数量超过这个值,那最早的checkpoint将被删除。 + +如何载入模型变量 +################ + +与模型变量的保存相对应,我们提供了两套API来分别载入模型的参数和载入模型的checkpoint。 + +载入模型用于对新样本的预测 +========================== + +对于通过 :code:`fluid.io.save_params` 保存的模型,可以使用 :code:`fluid.io.load_params` +来进行载入。 + +例如: + +.. 
code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_params(executor=exe, dirname=param_path, + main_program=prog) + +上面的例子中,通过调用 :code:`fluid.io.load_params` 函数,PaddlePaddle Fluid会对 +:code:`prog` 中的所有模型变量进行扫描,筛选出其中所有的模型参数, +并尝试从 :code:`param_path` 之中读取加载它们。 + +需要格外注意的是,这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_params` +时所用的 :code:`prog` 中的前向部分完全一致,且不能包含任何参数更新的操作。如果两者存在不一致, +那么可能会导致一些变量未被正确加载;如果错误地包含了参数更新操作,那可能会导致正常预测过程中参数被更改。 +这两个 :code:`fluid.Program` 之间的关系类似于训练 :code:`fluid.Program` +和测试 :code:`fluid.Program` 之间的关系,详见: :ref:`user_guide_test_while_training`。 + +另外,需特别注意运行 :code:`fluid.default_startup_program()` 必须在调用 :code:`fluid.io.load_params` +之前。如果在之后运行,可能会覆盖已加载的模型参数导致错误。 + + +载入checkpoint用于恢复训练 +========================== + +对于通过 :code:`fluid.io.save_checkpoint` 保存的模型,可以使用 :code:`fluid.io.load_checkpoint` +来进行载入。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + +上面的例子中,通过调用 :code:`fluid.io.save_checkpoint` 函数,PaddlePaddle Fluid会对 +:code:`prog` 中的所有模型变量进行扫描,根据内置规则自动筛选出需要加载的变量, +并尝试从 :code:`path` 之中加载它们。 + +参数 :code:`serial` 用来标记具体要加载的checkpoint的版本号。在保存checkpoint的时候, +一个checkpoint会被保存在一个子目录中,并在目录名上体现出自己的版本号。 +一般越大的版本号表示这个checkpoint越新。 + +这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_checkpoint` 时所用的 :code:`prog` +完全一致,否则会导致变量加载错误或者未加载。另外,与 :code:`fluid.io.save_params` 类似, +运行 :code:`fluid.default_startup_program()` 也必须在 :code:`fluid.io.load_checkpoint` +之前进行。 + +多机checkpoint保存 +################## + +.. toctree:: + :maxdepth: 2 + + checkpoint_doc_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/user_guides/howto/training/single_node.rst b/doc/fluid/new_docs/user_guides/howto/training/single_node.rst new file mode 100644 index 0000000000000000000000000000000000000000..23eac0f831f2d6d052b7fc35b536d4ab633df851 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/single_node.rst @@ -0,0 +1,119 @@ +######## +单机训练 +######## + +准备工作 +######## + +要进行PaddlePaddle Fluid单机训练,需要先 :ref:`user_guide_prepare_data` 和 +:ref:`user_guide_configure_simple_model` 。当\ +:ref:`user_guide_configure_simple_model` 完毕后,可以得到两个\ +:code:`fluid.Program`, :code:`startup_program` 和 :code:`main_program`。 +默认情况下,可以使用 :code:`fluid.default_startup_program()` 与\ :code:`fluid.default_main_program()` 获得全局的 :code:`fluid.Program`。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + image = fluid.layers.data(name="image", shape=[784]) + label = fluid.layers.data(name="label", shape=[1]) + hidden = fluid.layers.fc(input=image, size=100, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=prediction, + label=label + ) + ) + + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd.minimize(loss) + + # Here the fluid.default_startup_program() and fluid.default_main_program() + # has been constructed. + +在上述模型配置执行完毕后, :code:`fluid.default_startup_program()` 与\ +:code:`fluid.default_main_program()` 配置完毕了。 + +初始化参数 +########## + +参数随机初始化 +============== + +用户配置完模型后,参数初始化操作会被写入到\ +:code:`fluid.default_startup_program()` 中。使用 :code:`fluid.Executor()` 运行 +这一程序,即可在全局 :code:`fluid.global_scope()` 中随机初始化参数。例如: + +.. 
code-block:: python + + exe = fluid.Executor(fluid.CUDAPlace(0)) + exe.run(program=fluid.default_startup_program()) + +值得注意的是: 如果使用多GPU训练,参数需要先在GPU0上初始化,再经由\ +:code:`fluid.ParallelExecutor` 分发到多张显卡上。 + + +载入预定义参数 +============== + +在神经网络训练过程中,经常会需要载入预定义模型,进而继续进行训练。\ +如何载入预定义参数,请参考 :ref:`user_guide_save_load_vars`。 + + +单卡训练 +######## + +执行单卡训练可以使用 :code:`fluid.Executor()` 中的 :code:`run()` 方法,运行训练\ +:code:`fluid.Program` 即可。在运行的时候,用户可以通过 :code:`run(feed=...)`\ +参数传入数据;用户可以通过 :code:`run(fetch=...)` 获取持久的数据。例如:\ + +.. code-block:: python + + ... + loss = fluid.layers.mean(...) + + exe = fluid.Executor(...) + # the result is an numpy array + result = exe.run(feed={"image": ..., "label": ...}, fetch_list=[loss]) + +这里有几点注意事项: + +1. feed的数据格式,请参考文章 :ref:`user_guide_feed_data_to_executor`。 +2. :code:`Executor.run` 的返回值是 :code:`fetch_list=[...]` 的variable值。被fetch\ + 的Variable必须是persistable的。 :code:`fetch_list` 可以传入Variable的列表,\ + 也可以传入Variable的名字列表。:code:`Executor.run` 返回Fetch结果列表。 +3. 如果需要取回的数据包含序列信息,可以设置 + :code:`exe.run(return_numpy=False, ...)` 直接返回 :code:`fluid.LoDTensor` + 。用户可以直接访问 :code:`fluid.LoDTensor` 中的信息。 + +多卡训练 +######## + +执行多卡训练可以使用 :code:`fluid.ParallelExecutor` 运行训练 +:code:`fluid.Program`。例如: + +.. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name, + main_program=fluid.default_main_program()) + train_exe.run(fetch_list=[loss.name], feed={...}) + +这里有几点注意事项: + +1. :code:`ParallelExecutor` 的构造函数需要指明要执行的 :code:`fluid.Program` , + 并在执行过程中不能修改。默认值是 :code:`fluid.default_main_program()` 。 +2. :code:`ParallelExecutor` 需要明确指定是否使用 CUDA 显卡进行训练。在显卡训练\ + 模式下会占用全部显卡。用户可以配置 `CUDA_VISIBLE_DEVICES `_ 来修改占用\ + 的显卡。 + +进阶使用 +######## + +.. toctree:: + :maxdepth: 2 + + test_while_training + save_load_variables diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle new file mode 100644 index 0000000000000000000000000000000000000000..16f6b8835c4ffb82babca56b62ba44494fd6a947 Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png new file mode 100644 index 0000000000000000000000000000000000000000..587a1a48affdde6809d7f8bf77e1055db7cd8c14 Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle new file mode 100644 index 0000000000000000000000000000000000000000..046c4903231e8ca441884674c08b381766c0bbae Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png new file mode 100644 index 0000000000000000000000000000000000000000..cd2f92ad1a14ac12efc2c257c8aa3d1ae403b2b1 Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png b/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png new file mode 100644 index 
0000000000000000000000000000000000000000..6c078b5241559a05219447db67b5d8a35aeefd3f Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst b/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst new file mode 100644 index 0000000000000000000000000000000000000000..37d5c0d78179ccead7a81dffb4ae2f0d835a5949 --- /dev/null +++ b/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst @@ -0,0 +1,120 @@ +.. _user_guide_test_while_training: + +################## +训练过程中评测模型 +################## + +模型的测试评价与训练的 :code:`fluid.Program` 不同。在测试评价中: + +1. 评价测试不进行反向传播,不优化更新参数。 +2. 评价测试执行的操作可以不同。 + + * 例如 BatchNorm 操作,在训练和测试时执行不同的算法。 + + * 评价模型与训练相比可以是完全不同的模型。 + +生成测试 :code:`fluid.Program` +################################# + +通过克隆训练 :code:`fluid.Program` 生成测试 :code:`fluid.Program` +======================================================================= + +:code:`Program.clone()` 方法可以复制出新的 :code:`fluid.Program` 。 通过设置 +:code:`Program.clone(for_test=True)` 复制含有用于测试的操作Program。简单的使用方法如下: + +.. code-block:: python + + import paddle.fluid as fluid + + img = fluid.layers.data(name="image", shape=[784]) + prediction = fluid.layers.fc( + input=fluid.layers.fc(input=img, size=100, act='relu'), + size=10, + act='softmax' + ) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label)) + acc = fluid.layers.accuracy(input=prediction, label=label) + + test_program = fluid.default_main_program().clone(for_test=True) + + adam = fluid.optimizer.Adam(learning_rate=0.001) + adam.minimize(loss) + +在使用 :code:`Optimizer` 之前,将 :code:`fluid.default_main_program()` 复制\ +成一个 :code:`test_program` 。之后使用测试数据运行 :code:`test_program`,\ +就可以做到运行测试程序,而不影响训练结果。 + +分别配置训练 :code:`fluid.Program` 和测试 :code:`fluid.Program` +===================================================================== + +如果训练程序和测试程序相差较大时,用户也可以通过完全定义两个不同的 +:code:`fluid.Program`,分别进行训练和测试。在PaddlePaddle Fluid中,\ +所有的参数都有名字。如果两个不同的操作,甚至两个不同的网络使用了同样名字的参数,\ +那么他们的值和内存空间都是共享的。 + +PaddlePaddle Fluid中使用 :code:`fluid.unique_name` 包来随机初始化用户未定义的\ +参数名称。通过 :code:`fluid.unique_name.guard` 可以确保多次调用某函数\ +参数初始化的名称一致。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + def network(is_test): + file_obj = fluid.layers.open_files(filenames=["test.recordio"] if is_test else ["train.recordio"], ...) + img, label = fluid.layers.read_file(file_obj) + hidden = fluid.layers.fc(input=img, size=100, act="relu") + hidden = fluid.layers.batch_norm(input=hidden, is_test=is_test) + ... + return loss + + with fluid.unique_name.guard(): + train_loss = network(is_test=False) + sgd = fluid.optimizer.SGD(0.001) + sgd.minimize(train_loss) + + test_program = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_gurad(test_program, fluid.Program()): + test_loss = network(is_test=True) + + # fluid.default_main_program() is the train program + # fluid.test_program is the test program + +执行测试 :code:`fluid.Program` +################################# + +使用 :code:`Executor` 执行测试 :code:`fluid.Program` +======================================================= + +用户可以使用 :code:`Executor.run(program=...)` 来执行测试 +:code:`fluid.Program`。 + +例如 + +.. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + test_acc = exe.run(program=test_program, feed=test_data_batch, fetch_list=[acc]) + print 'Test accuracy is ', test_acc + +使用 :code:`ParallelExecutor` 执行测试 :code:`fluid.Program` +=============================================================== + +用户可以使用训练用的 :code:`ParallelExecutor` 与测试 :code:`fluid.Program` +一起新建一个测试的 :code:`ParallelExecutor` ;再使用测试 +:code:`ParallelExecutor.run` 来执行测试。 + +例如: + +.. code-block:: python + + train_exec = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + + test_exec = fluid.ParallelExecutor(use_cuda=True, share_vars_from=train_exec, + main_program=test_program) + test_acc = test_exec.run(fetch_list=[acc], ...) + diff --git a/doc/fluid/new_docs/user_guides/index.rst b/doc/fluid/new_docs/user_guides/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..377631109d8f65c149b12cd2a0e4da920fdf4def --- /dev/null +++ b/doc/fluid/new_docs/user_guides/index.rst @@ -0,0 +1,19 @@ +######## +使用指南 +######## + + +.. todo:: + + 完善导引介绍 + +.. toctree:: + :maxdepth: 2 + + howto/prepare_data/index + howto/configure_simple_model/index + howto/training/index + howto/debug/index + howto/evaluation/index + howto/inference/index + models/index.rst diff --git a/doc/fluid/new_docs/user_guides/models/index.rst b/doc/fluid/new_docs/user_guides/models/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..998e95c4885dc313d9449f5466f80c53d34fe82a --- /dev/null +++ b/doc/fluid/new_docs/user_guides/models/index.rst @@ -0,0 +1,137 @@ +Fluid 模型库 +============ + +图像分类 +-------- + +图像分类是根据图像的语义信息对不同类别图像进行区分,是计算机视觉中重要的基础问题,是物体检测、图像分割、物体跟踪、行为分析、人脸识别等其他高层视觉任务的基础,在许多领域都有着广泛的应用。如:安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 + +在深度学习时代,图像分类的准确率大幅度提升,在图像分类任务中,我们向大家介绍了如何在经典的数据集ImageNet上,训练常用的模型,包括AlexNet、VGG、GoogLeNet、ResNet、Inception-v4、MobileNet、DPN(Dual +Path +Network)、SE-ResNeXt模型,也开源了\ `训练的模型 `__\ 方便用户下载使用。同时提供了能够将Caffe模型转换为PaddlePaddle +Fluid模型配置和参数文件的工具。 + +- `AlexNet `__ +- `VGG `__ +- `GoogleNet `__ +- `Residual + Network `__ +- `Inception-v4 `__ +- `MobileNet `__ +- `Dual Path + Network `__ +- `SE-ResNeXt `__ +- `Caffe模型转换为Paddle + Fluid配置和模型文件工具 `__ + +目标检测 +-------- + +目标检测任务的目标是给定一张图像或是一个视频帧,让计算机找出其中所有目标的位置,并给出每个目标的具体类别。对于人类来说,目标检测是一个非常简单的任务。然而,计算机能够“看到”的是图像被编码之后的数字,很难解图像或是视频帧中出现了人或是物体这样的高层语义概念,也就更加难以定位目标出现在图像中哪个区域。与此同时,由于目标会出现在图像或是视频帧中的任何位置,目标的形态千变万化,图像或是视频帧的背景千差万别,诸多因素都使得目标检测对计算机来说是一个具有挑战性的问题。 + +在目标检测任务中,我们介绍了如何基于\ `PASCAL +VOC `__\ 、\ `MS +COCO `__\ 数据训练通用物体检测模型,当前介绍了SSD算法,SSD全称Single Shot MultiBox Detector,是目标检测领域较新且效果较好的检测算法之一,具有检测速度快且检测精度高的特点。 + +开放环境中的检测人脸,尤其是小的、模糊的和部分遮挡的人脸也是一个具有挑战的任务。我们也介绍了如何基于 `WIDER FACE `_ 数据训练百度自研的人脸检测PyramidBox模型,该算法于2018年3月份在WIDER FACE的多项评测中均获得 `第一名 `_。 + +- `Single Shot MultiBox + Detector `__ +- `Face Detector: PyramidBox `_ + +图像语义分割 +------------ + +图像语意分割顾名思义是将图像像素按照表达的语义含义的不同进行分组/分割,图像语义是指对图像内容的理解,例如,能够描绘出什么物体在哪里做了什么事情等,分割是指对图片中的每个像素点进行标注,标注属于哪一类别。近年来用在无人车驾驶技术中分割街景来避让行人和车辆、医疗影像分析中辅助诊断等。 + +在图像语义分割任务中,我们介绍如何基于图像级联网络(Image Cascade +Network,ICNet)进行语义分割,相比其他分割算法,ICNet兼顾了准确率和速度。 + +- `ICNet `__ + +场景文字识别 +------------ + +许多场景图像中包含着丰富的文本信息,对理解图像信息有着重要作用,能够极大地帮助人们认知和理解场景图像的内容。场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下,将图像信息转化为文字序列的过程,可认为是一种特别的翻译过程:将图像输入翻译为自然语言输出。场景图像文字识别技术的发展也促进了一些新型应用的产生,如通过自动识别路牌中的文字帮助街景应用获取更加准确的地址信息等。 + +在场景文字识别任务中,我们介绍如何将基于CNN的图像特征提取和基于RNN的序列翻译技术结合,免除人工定义特征,避免字符分割,使用自动学习到的图像特征,完成端到端地无约束字符定位和识别。当前,介绍了CRNN-CTC模型,后续会引入基于注意力机制的序列到序列模型。 + +- `CRNN-CTC模型 `__ + +语音识别 +-------- + +自动语音识别(Automatic Speech Recognition, 
+ASR)是将人类声音中的词汇内容转录成计算机可输入的文字的技术。语音识别的相关研究经历了漫长的探索过程,在HMM/GMM模型之后其发展一直较为缓慢,随着深度学习的兴起,其迎来了春天。在多种语言识别任务中,将深度神经网络(DNN)作为声学模型,取得了比GMM更好的性能,使得 +ASR +成为深度学习应用最为成功的领域之一。而由于识别准确率的不断提高,有越来越多的语言技术产品得以落地,例如语言输入法、以智能音箱为代表的智能家居设备等 +—— 基于语言的交互方式正在深刻的改变人类的生活。 + +与 `DeepSpeech `__ +中深度学习模型端到端直接预测字词的分布不同,本实例更接近传统的语言识别流程,以音素为建模单元,关注语言识别中声学模型的训练,利用\ `kaldi `__\ 进行音频数据的特征提取和标签对齐,并集成 +kaldi 的解码器完成解码。 + +- `DeepASR `__ + +机器翻译 +-------- + +机器翻译(Machine +Translation)将一种自然语言(源语言)转换成一种自然语言(目标语音),是自然语言处理中非常基础和重要的研究方向。在全球化的浪潮中,机器翻译在促进跨语言文明的交流中所起的重要作用是不言而喻的。其发展经历了统计机器翻译和基于神经网络的神经机器翻译(Nueural +Machine Translation, NMT)等阶段。在 NMT +成熟后,机器翻译才真正得以大规模应用。而早阶段的 NMT +主要是基于循环神经网络 RNN +的,其训练过程中当前时间步依赖于前一个时间步的计算,时间步之间难以并行化以提高训练速度。因此,非 +RNN 结构的 NMT 得以应运而生,例如基于卷积神经网络 CNN +的结构和基于自注意力机制(Self-Attention)的结构。 + +本实例所实现的 Transformer +就是一个基于自注意力机制的机器翻译模型,其中不再有RNN或CNN结构,而是完全利用 +Attention 学习语言中的上下文依赖。相较于RNN/CNN, +这种结构在单层内计算复杂度更低、易于并行化、对长程依赖更易建模,最终在多种语言之间取得了最好的翻译效果。 + +- `Transformer `__ + +强化学习 +-------- + +强化学习是近年来一个愈发重要的机器学习方向,特别是与深度学习相结合而形成的深度强化学习(Deep +Reinforcement Learning, +DRL),取得了很多令人惊异的成就。人们所熟知的战胜人类顶级围棋职业选手的 +AlphaGo 就是 DRL +应用的一个典型例子,除游戏领域外,其它的应用还包括机器人、自然语言处理等。 + +深度强化学习的开山之作是在Atari视频游戏中的成功应用, +其可直接接受视频帧这种高维输入并根据图像内容端到端地预测下一步的动作,所用到的模型被称为深度Q网络(Deep +Q-Network, DQN)。本实例就是利用PaddlePaddle Fluid这个灵活的框架,实现了 +DQN 及其变体,并测试了它们在 Atari 游戏中的表现。 + +- `DeepQNetwork `__ + +中文词法分析 +------------ + +中文分词(Word Segmentation)是将连续的自然语言文本,切分出具有语义合理性和完整性的词汇序列的过程。因为在汉语中,词是承担语义的最基本单位,切词是文本分类、情感分析、信息检索等众多自然语言处理任务的基础。 词性标注(Part-of-speech Tagging)是为自然语言文本中的每一个词汇赋予一个词性的过程,这里的词性包括名词、动词、形容词、副词等等。 命名实体识别(Named Entity Recognition,NER)又称作“专名识别”,是指识别自然语言文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等。 我们将这三个任务统一成一个联合任务,称为词法分析任务,基于深度神经网络,利用海量标注语料进行训练,提供了一个端到端的解决方案。 + +我们把这个联合的中文词法分析解决方案命名为LAC。LAC既可以认为是Lexical Analysis of Chinese的首字母缩写,也可以认为是LAC Analyzes Chinese的递归缩写。 + +- `LAC `__ + +情感倾向分析 +------------ + +情感倾向分析针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极、 中性。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控,为企业提供有力的决策支持。本次我们开放 AI开放平台中情感倾向分析采用的模型(http://ai.baidu.com/tech/nlp/sentiment_classify ), 提供给用户使用。 + +- `Senta `__ + +AnyQ +---- + +`AnyQ `__\ (ANswer Your Questions) +开源项目主要包含面向FAQ集合的问答系统框架、文本语义匹配工具SimNet。 +问答系统框架采用了配置化、插件化的设计,各功能均通过插件形式加入,当前共开放了20+种插件。开发者可以使用AnyQ系统快速构建和定制适用于特定业务场景的FAQ问答系统,并加速迭代和升级。 + +SimNet是百度自然语言处理部于2013年自主研发的语义匹配框架,该框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MM-DNN等核心网络结构形式,同时基于该框架也集成了学术界主流的语义匹配模型,如MatchPyramid、MV-LSTM、K-NRM等模型。使用SimNet构建出的模型可以便捷的加入AnyQ系统中,增强AnyQ系统的语义匹配能力。 + +- `SimNet in PaddlePaddle + Fluid `__ diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md index 6b80b014b1b1dc50f425e1296f70984c9e9b1cbd..7f62eeadff43af1f0a3c81e284a6508bf063b21e 100644 --- a/doc/survey/dynamic_graph.md +++ b/doc/survey/dynamic_graph.md @@ -2,42 +2,47 @@ ## Automatic Differentiation -A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf). +A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program, which has been long studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the prosperity of deep learning. -## The Tape +## Program Transformation v.s. 
Backtracking

-Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass:
+Given the forward pass program, there are two strategies to derive the backward pass:

-1. from the forward pass program itself, or
-1. from the execution trace of the forward pass program, which is often known as the *tape*.
+1. by transforming the forward pass program without executing it, or
+1. by backtracking the execution process of the forward pass program.

-This article surveys systems that follow the latter strategy.
+This article is about the latter strategy.

-## Dynamic Network
+## The Tape and Dynamic Networks

-When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*.
+We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf). When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is time-consuming, but it naturally handles the case where the forward program contains control flow such as if-else and for/while: with control flow, the execution trace may change from iteration to iteration. Such changes are known as *dynamic networks* in the field of deep learning.

-Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+## Typical Systems

-## An Overview
+Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years. This article surveys the following typical systems:

-Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+- [DyNet](https://dynet.readthedocs.io/en/latest/)
+- [PyTorch](https://pytorch.org/)
+- Chainer
+- Autograd from HIPS

-Consider the following code feedforward model.
+Before diving into these systems, let us set up an example forward pass program:

```python
x = Variable(randn(20, 1))
label = Variable(randint(1))
W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
h = matmul(W_1, x)
-pred = matmul(W_2, x)
+pred = matmul(W_2, h)
loss = softmax(pred, label)
loss.backward()
```

-### 1) Dynet uses List to encode the Tape
+## The Representation of Tapes

-During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+### DyNet: the Tape as a List
+
+DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`. The list also includes the information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
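To make the list representation concrete, here is a minimal, self-contained sketch of a tape recorded as a Python list and replayed in reverse. All names here (`TapeEntry`, `mul`, `add`, `backward`) are made up for illustration; this is not DyNet's actual implementation or API.

```python
# A toy tape: each forward op appends an entry; backward() replays the list in reverse.
class TapeEntry(object):
    def __init__(self, op_name, inputs, output, grad_fn):
        self.op_name = op_name  # e.g. "matmul", "softmax"
        self.inputs = inputs    # ids of the input variables
        self.output = output    # id of the output variable
        self.grad_fn = grad_fn  # maps d(output) -> [d(input), ...]

tape = []    # filled while the forward pass runs
values = {}  # variable id -> value

def mul(a, b, out):
    values[out] = values[a] * values[b]
    tape.append(TapeEntry("mul", [a, b], out,
                          lambda dy: [dy * values[b], dy * values[a]]))

def add(a, b, out):
    values[out] = values[a] + values[b]
    tape.append(TapeEntry("add", [a, b], out, lambda dy: [dy, dy]))

def backward(loss):
    grads = {loss: 1.0}
    for entry in reversed(tape):  # play the tape back in reverse order
        dy = grads.get(entry.output, 0.0)
        for var, g in zip(entry.inputs, entry.grad_fn(dy)):
            grads[var] = grads.get(var, 0.0) + g
    return grads

# forward pass: loss = x * w + b
values.update({"x": 2.0, "w": 3.0, "b": 1.0})
mul("x", "w", "h")
add("h", "b", "loss")
grads = backward("loss")
print(grads["x"], grads["w"], grads["b"])  # 3.0 2.0 1.0
```

Note that nothing in this sketch requires a topological sort: appending entries during the forward pass already yields a valid reverse order for the backward pass.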
@@ -69,9 +74,9 @@ digraph g { ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) -### 2) Pytorch uses Node Graph to encode the Tape +### PyTorch: the Tape as a Graph -The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. +The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. Please be aware that a `Function` might have more than one `prev_func`s.
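The following sketch illustrates the `creator`/`prev_func` bookkeeping and the topological sort described above. The attribute names follow the text; everything else is a simplified, hypothetical schematic, not PyTorch's real autograd internals.

```python
# Schematic graph of Variables and Functions: each Variable knows its creator,
# and each Function knows the functions that produced its inputs (prev_funcs).
class Function(object):
    def __init__(self, op_type, prev_funcs):
        self.op_type = op_type        # e.g. "matmul", "softmax"
        self.prev_funcs = prev_funcs  # dependent functions, possibly several

class Variable(object):
    def __init__(self, creator=None):
        self.creator = creator        # Function that produced this Variable, None for leaves

def backward_order(root_func):
    """Topologically sort the functions so that consumers come before producers."""
    order, visited = [], set()
    def visit(f):
        if f is None or f in visited:
            return
        visited.add(f)
        for p in f.prev_funcs:
            visit(p)
        order.append(f)
    visit(root_func)
    return list(reversed(order))

# Graph for: h = matmul(W_1, x); pred = matmul(W_2, h); loss = softmax(pred, label)
matmul0 = Function("matmul", prev_funcs=[])
h = Variable(creator=matmul0)
matmul1 = Function("matmul", prev_funcs=[h.creator])
pred = Variable(creator=matmul1)
softmax = Function("softmax", prev_funcs=[pred.creator])
loss = Variable(creator=softmax)

# loss.backward() would run each function's grad op in this order:
print([f.op_type for f in backward_order(loss.creator)])  # ['softmax', 'matmul', 'matmul']
```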
@@ -132,27 +137,22 @@ digraph g { ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) -Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix. - -## Design choices +Chainer and Autograd use the similar techniques to record the forward pass. For details, please refer to the appendix. -### 1) Dynet's List vs Pytorch's Node Graph +## Comparison: List v.s. Graph -What's good about List: -1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator. -1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping. +The list of DyNet could be considered the result of the topological sort of the graph of PyTorch. Or, the graph is the raw representation of the tape, which gives us the chance to *prune* part of the graph that is irrelevant with the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). Consider the following example, PyTorch only does backward on `SmallNet` while DyNet does both `SmallNet` and `BigNet`: -What's good about Node Graph: -1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. 
[[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.

```python
result = BigNet(data)
loss = SmallNet(data)
loss.backward()
```

-### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+## Lazy v.s. Immediate Evaluation
+
+Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:

-Dynet builds the list in a symbolic matter. Consider the following example
```python
for epoch in range(num_epochs):
    for in_words, out_label in training_data:
@@ -164,16 +164,17 @@ for epoch in range(num_epochs):
        loss_val = loss_sym.value()
        loss_sym.backward()
```
+
The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes graph-level optimizations, e.g. kernel fusion, possible.

-Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.

-## What can fluid learn from them?
+## Fluid: Learning the Lessons

Please refer to `paddle/contrib/dynamic/`.

-# Appendix
+## Appendix

### Overview
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6e48f4f58269b67450cb012f6dcc59e1083abba
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion
+Fusing multiple operators together is an important method to optimize program execution, particularly for GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of saving intermediate results back into global memory.
+
+There are generally two ways to fuse operators: fusing directly connected operators, and fusing non-directly connected operators. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to specify these rules manually.
+
+## Challenge
+The challenges of fusing operators are:
+ - how to make the rules.
+ - how to implement these rules efficiently.
+
+### How to make the rules?
+
+The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of typical DL models, we found that there are two groups of operators that can be fused explicitly: one is simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`; the other is operators that have the same function, for example, a series of `SGD` or `Momentum` operators. They usually appear in the model in large numbers.
So we should think about how to fuse them separately first. + +### How to implement these rules efficiently? +#### How to fuse the adjacent operations efficiently? +Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate. + +#### How to fuse the operators that have the same function efficiently? +We take SGD operator as an example, the training model may have hundreds of parameters and correspondingly have the same number of SGD operators. The expression(`w = w - lr*w_g`) of those operators is the same, so during of training, the executor will execute this expression hundreds time in CPU or other specialized accelerators. If we can fuse them and make the address of all `w` and all `w_g` continuous respectively, we only need execute one time. For some accelerators, the time of launching kernel is not neglected, so the time of hundreds of times of launching and executing kernel may be larger than launching and executing only once. There usually are many operators that similar to `SGD` in the DL model, such as `AllReduce` and `FC`. diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst index 70c5c524aaf0a9ae003bf4340c3f268c225d4419..5813509dce46677444f0234db8e0eaa4f113e3a0 100644 --- a/doc/v2/api/index_en.rst +++ b/doc/v2/api/index_en.rst @@ -4,7 +4,6 @@ API .. toctree:: :maxdepth: 1 - overview.rst model_configs.rst data.rst run_logic.rst diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index 6421c5308271c2508597d849c79709255caf349a..d0dacb104f148c2aeb323365cbd6f014ae00ed5a 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -35,11 +35,16 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像 docker build -t paddle:dev . # 3. 执行下面的命令编译CPU-Only的二进制 - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build # 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步) docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build -注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 +注: + +- 上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 + +- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI `__. +PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`. 编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst index b08b45d43ec7f1deb2889832079a731ee724a44c..664b68da8b7dd3e005ebf3ec34de77729e5ab355 100644 --- a/doc/v2/build_and_install/build_from_source_en.rst +++ b/doc/v2/build_and_install/build_from_source_en.rst @@ -36,13 +36,18 @@ If you don't wish to use docker,you need to install several compile dependenci # 2. Optional: build development docker image from source docker build -t paddle:dev . # 3. 
Run the following command to build a CPU-Only binaries - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build -NOTE: The above command try to mount the current working directory (root directory of source code) +NOTE: + +- The above command try to mount the current working directory (root directory of source code) into :code:`/paddle` directory inside docker container. +- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI `__. +Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` . + When the compile finishes, you can get the output whl package under build/python/dist, then you can choose to install the whl on local machine or copy it to the target machine. diff --git a/doc/v2/faq/parameter/index_en.rst b/doc/v2/faq/parameter/index_en.rst index 61c7845af7e531013a06125f7c35b59081dafb42..9edb8dd620f972d019db9c0063cefce616de0ebd 100644 --- a/doc/v2/faq/parameter/index_en.rst +++ b/doc/v2/faq/parameter/index_en.rst @@ -1,5 +1,198 @@ -################# -Parameter Setting -################# +################## +Parameter Settings +################## -TBD +.. contents:: + +1. How to Choose the Learning Rate of SGD Algorithm +-------------------------- + +An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time. + +Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model. + +If the cost of the training process is significantly higher than the cost of the output, then we judge that the training does not converge. For example, if we have a three-class problem and use multi-class-cross-entropy as the cost, the ratio of 0, 1, and 2 in the data will be :code:`0.2, 0.5, 0.3`. The minimum cost thus will be :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is greater than this number after training a pass (or even before), then the training may not be converged and the learning rate should be reduced. + +2. How to Implement Learning Rate Annealing +------------------------------------------------ + +We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows: + +.. code-block:: python + +    Optimizer = paddle.optimizer.Adam( +        Learning_rate=1e-3, +        Learning_rate_decay_a=0.5, +        Learning_rate_decay_b=0.75, +        Learning_rate_schedule="poly",) + +PaddlePaddle currently supports 8 learning rate schedules. 
The 8 learning rate schedules and their corresponding learning rates are calculated as follows: + +* "constant" +   +  Lr = learning_rate + +* "poly" + +  Lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b) + +  Variable :code:`num_samples_processed` is the number of trained samples. + +* "caffe_poly" + +  Lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b) + +* "exp" + +  Lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b) + +* "discexp" + +  Lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b)) + +* "linear" + +  Lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b) + +* "manual" + +  This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example: + +  .. code-block:: python + +      Optimizer = paddle.optimizer.Adam( +          Learning_rate=1e-3, +          Learning_rate_schedule="manual", +          Learning_rate_args="1000:1.0,2000:0.9,3000:0.8",) + +  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is: code:`1e-3*1.0`; when the number of trained samples is greater than 1000 or less than or equal to 2000, the learning rate is:code:`1e- 3 * 0.9`; when the number of trained samples is greater than 2,000, the learning rate is: code:`1e-3*0.8`. + +* "pass_manual" + +  This is a learning rate annealing method that piecewisely pick values according to the number of trained passes. When using this learning rate schedule, we set the learning rate attenuation factor piecewise function by the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example: + +  .. code-block:: python + +      Optimizer = paddle.optimizer.Adam( +          Learning_rate=1e-3, +          Learning_rate_schedule="pass_manual", +          Learning_rate_args="1:1.0,2:0.9,3:0.8",) + +  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3*1.0`; when the number of trained passes is greater than 1 or less than 2, the learning rate is :code:`1e- 3 * 0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3*0.8`. + +3. How to Initialize Parameters +----------------- + +By default, PaddlePaddle initializes parameters with an average of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters: + +* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` +* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` + +For example, to set a full connection layer parameter initialization mode and bias initialization mode, you can use the following code: + +.. 
code-block:: python + +    Hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), +                      Bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) + +The above code initializes the bias to 1.0 and initializes the parameters to a uniform distribution of :code:`[1.0, -1.0]`. + +4. How to Share Parameters +--------------- + +PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name will share parameters//. We can set the name of the parameters using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the parameters to be shared use the same :code:`ParamAttr` object. + +A simple fully connected network has its configuration of parameter sharing as follows \: + +.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py + +Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two input of the softmax layer also use the same parameter :code:`softmax_param`. + +5. How to Load Pre-training Parameters +------------------------ +* For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows: + +.. code-block:: python + +    Emb_para = paddle.attr.Param(name='emb', is_static=True) +    Paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) + + +* Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle is the header information. The user must loads : :code:`numpy.array` starting with the 17th byte. Take the embedding layer as an example, the code is as follows: + +.. code-block:: python + +    Def load_parameter(file_name, h, w): +        With open(file_name, 'rb') as f: +            F.read(16) # skip header. +            Return np.fromfile(f, dtype=np.float32).reshape(h, w) + +    Parameters = paddle.parameters.create(my_cost) +    Parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) + +6. Format of the Stored Parameter and How to Convert the File to Plain Text +-------------------------------------------------- + +The model parameter file saved by PaddlePaddle consists of 16 bytes of header information and network parameters. In the header information, the first four bytes show PaddlePaddle's version information. The user should fill in with 0s. The next four bytes represent the number of bytes occupied by each parameter. If the saved network parameter is a float type, the number is four; if it is a double, the number is eight. The third group of four bytes represents the total number of saved parameters. + +When restoring the model parameters saved by PaddlePaddle back to plain text, we use the corresponding data type :code:`numpy.array` to load specific network parameters. At this time, you can skip the header information of the PaddlePaddle model parameter file. If not specified to compile with a precision for double in PaddlePaddle, then the parameter file will be caiculated with a precision for float, and the argument will be stored as a float. In this case, when using :code:`numpy.array`, generally we set :code:`dtype=float32`. An example is as follows: + +.. 
code-block:: python
+
+    import numpy as np
+
+    def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip the 16-byte header
+        vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                   fmt="%.6f", delimiter=",")
+
+
+When converting plain-text parameters into model parameters that PaddlePaddle can load, the header information is constructed first, and then the network parameters are written. The following code converts a randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
+
+.. code-block:: python
+
+    import struct
+    import numpy as np
+
+    def gen_rand_param(param_file, width, height, need_trans):
+        np.random.seed()
+        header = struct.pack("iil", 0, 4, height * width)
+        param = np.float32(np.random.rand(height, width))
+        with open(param_file, "wb") as fparam:
+            fparam.write(header + param.tostring())
+
+7. A Protocol Message Rejected Because of its Large Size
+--------------------------------------------------------
+
+If you are training NLP-related models and the following error occurs:
+
+.. code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. A wrongly defined `define_py_data_sources2` call looks like this:
+
+.. code-block:: python
+
+    src_dict = dict()
+    for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+    define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+The solution is to pass the path of the dictionary file as args to the dataprovider, and then load the dictionary from that path inside the dataprovider. Change the `define_py_data_sources2` call to:
+
+.. code-block:: python
+
+    define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+The full source code can be found in the `sequence_recurrent `_ example.
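For completeness, below is a rough sketch of what the dataprovider side of this fix might look like: the dictionary path arrives in the ``init_hook`` as a keyword argument and the dictionary is built inside the dataprovider, so only a short path string travels through the trainer configuration. The ``provider``/``init_hook`` usage is assumed from the v2 ``PyDataProvider2`` interface, and the ``input_types`` and the yielded record format are placeholders to be adapted to the actual task.

.. code-block:: python

    # dataprovider.py -- illustrative sketch only; adapt input_types and the
    # yielded records to your task.
    from paddle.trainer.PyDataProvider2 import provider, integer_value_sequence

    def on_init(settings, src_dict_path, **kwargs):
        # Build the dictionary here, from the path passed via `args`,
        # instead of serializing the whole dictionary into the config.
        settings.src_dict = {}
        for line_count, line in enumerate(open(src_dict_path, "r")):
            settings.src_dict[line.strip()] = line_count
        settings.input_types = [integer_value_sequence(len(settings.src_dict))]

    @provider(init_hook=on_init)
    def process(settings, file_name):
        with open(file_name, "r") as f:
            for line in f:
                words = line.strip().split()
                yield [settings.src_dict.get(w, 0) for w in words]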
diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md index 2c87e9afc6911526cd51d6c691f262960accc9e8..8878ee9d85064ba27708ed92790aa9b83ba316e5 100644 --- a/doc/v2/howto/capi/compile_paddle_lib_cn.md +++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md @@ -22,23 +22,23 @@ cpu_noavx_openblas -paddle.tgz +paddle.tgz cuda7.5_cudnn5_avx_mkl -paddle.tgz +paddle.tgz cuda8.0_cudnn5_avx_mkl -paddle.tgz +paddle.tgz cuda8.0_cudnn7_avx_mkl -paddle.tgz +paddle.tgz cuda9.0_cudnn7_avx_mkl -paddle.tgz +paddle.tgz diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md index 3fa8a18a9fbea21b494c416e6b938990fbb68337..70a6edef27e75af6b38d7d4824c928eba0d29b9a 100644 --- a/doc/v2/howto/capi/compile_paddle_lib_en.md +++ b/doc/v2/howto/capi/compile_paddle_lib_en.md @@ -13,31 +13,31 @@ cpu_avx_mkl -paddle.tgz +paddle.tgz cpu_avx_openblas -paddle.tgz +paddle.tgz cpu_noavx_openblas -paddle.tgz +paddle.tgz cuda7.5_cudnn5_avx_mkl -paddle.tgz +paddle.tgz cuda8.0_cudnn5_avx_mkl -paddle.tgz +paddle.tgz cuda8.0_cudnn7_avx_mkl -paddle.tgz +paddle.tgz cuda9.0_cudnn7_avx_mkl -paddle.tgz +paddle.tgz diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md index 3acdbae28e9b35f8a9104a89c9a5799f8c892334..db1568a2afbea3cca0d4e1fe053ba9536a60ab3d 100644 --- a/doc/v2/howto/capi/workflow_of_capi_cn.md +++ b/doc/v2/howto/capi/workflow_of_capi_cn.md @@ -28,9 +28,9 @@ ### 准备预测模型 -准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。 +准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。 -调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 +调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 下面,我们将训练结束后存储下来的模型转换成预测模型。 @@ -48,7 +48,7 @@ dump_v2_config(predict, "trainer_config.bin", True) ``` - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 + 
对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 @@ -68,7 +68,7 @@ merge_v2_model(net, param_file, output_file) ``` - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 #### 注意事项 1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 @@ -77,10 +77,10 @@ ### 编写预测代码 -预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 +预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 #### step 1. 初始化PaddlePaddle运行环境 -第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 +第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 #### step2. 加载模型 @@ -88,8 +88,8 @@ 概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: -1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; -1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。 +1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; +1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。 - 注意事项 @@ -117,7 +117,7 @@ C-API支持的所有输入数据类型和他们的组织方式,请参考“输 #### step 4. 
前向计算
 
-完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
+完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
 
 #### step 5. 清理
diff --git a/doc/v2/howto/rnn/hierarchical_layer_en.rst b/doc/v2/howto/rnn/hierarchical_layer_en.rst
index 236f58a160c7f77c28e4b1216b83b3d3cdaaa459..fb668f1babb47f49b2dab6d2411565e99599d8b0 100644
--- a/doc/v2/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst
@@ -1,4 +1,89 @@
-Layers supporting hierarchical sequence as input
-================================================
-
-TBD
+####################################################
+Layers that Support Hierarchical Sequences as Input
+####################################################
+
+.. contents::
+
+Overview
+========
+
+A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input, or a 0-level sequence. A sentence made up of words is a single-level sequence; several sentences make up a paragraph, which is a double-level sequence.
+
+A double-level sequence is a nested sequence in which each element is a single-level sequence. This is a very flexible way of organizing data and helps construct complex input information.
+
+We distinguish the following levels of sequence:
+
++ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle.
++ Single-level sequence: multiple elements arranged in a row, where each element is a 0-level sequence. The order of the elements is an important part of the input.
++ Double-level sequence: multiple elements arranged in a row, where each element is a single-level sequence called a subseq of the double-level sequence, and each element of a subseq is a 0-level sequence.
+
+In PaddlePaddle, the following layers accept a double-level sequence as input and perform the corresponding computation.
+
+`pooling`
+=========
+
+`pooling` is used as follows:
+
+.. code-block:: python
+
+        seq_pool = pooling(input=layer,
+                           pooling_type=pooling.Max(),
+                           agg_level=AggregateLevel.TO_SEQUENCE)
+
+- `pooling_type` currently supports two types: `pooling.Max()` and `pooling.Avg()`.
+
+- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+
+  - Effect: a double-level sequence input is converted into a 0-level sequence, and a single-level sequence is converted into a 0-level sequence
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence, which is the average (or maximum) of the entire input sequence (single- or double-level)
+
+- When `agg_level=AggregateLevel.TO_SEQUENCE`:
+
+  - Effect: a double-level sequence is transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence in which each element is the average (or maximum) of the corresponding subseq of the original double-level sequence.
+
+`last_seq` and `first_seq`
+==========================
+
+An example of using `last_seq` is shown below (`first_seq` is used in the same way).
+
+.. code-block:: python
+
+        last = last_seq(input=layer,
+                        agg_level=AggregateLevel.TO_SEQUENCE)
+
+- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+
+  - Effect: a double-level sequence input is converted into a 0-level sequence, and a single-level sequence is converted into a 0-level sequence
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence, which is the last (or first) element of the input sequence (single- or double-level).
+
+- When `agg_level=AggregateLevel.TO_SEQUENCE`:
+
+  - Effect: a double-level sequence is transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence in which each element is the last (or first) element of the corresponding subseq of the double-level sequence.
+
+`expand`
+========
+
+`expand` is used as follows:
+
+.. code-block:: python
+
+        ex = expand(input=layer1,
+                    expand_as=layer2,
+                    expand_level=ExpandLevel.FROM_NO_SEQUENCE)
+
+- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
+
+  - Effect: a 0-level sequence is expanded to a single-level sequence or a double-level sequence
+  - Input: layer1 must be a 0-level sequence, the data to be expanded; layer2 can be a single-level or a double-level sequence and provides the expansion length information
+  - Output: a single-level or a double-level sequence; the sequence type and the number of elements are the same as layer2. If the output is a single-level sequence, each of its elements is a copy of the layer1 element. If the output is a double-level sequence, every element in it is a copy of the layer1 element
+
+- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+
+  - Effect: a single-level sequence is expanded to a double-level sequence
+  - Input: layer1 must be a single-level sequence, the data to be expanded; layer2 must be a double-level sequence and provides the expansion length information
+  - Output: a double-level sequence with the same number of elements as layer2. The number of elements in layer1 must equal the number of subseqs in layer2; the i-th element of the single-level sequence (a 0-level sequence) is expanded into a single-level sequence that forms the i-th subseq of the output double-level sequence.
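To tie the three layers together, here is a small illustrative fragment in the same style as the snippets above; the input layer `words` (a double-level sequence of sentences grouped into a paragraph) and all variable names are assumptions made for this sketch:

.. code-block:: python

    # words: a double-level sequence layer (sentences in a paragraph); assumed input.

    # One vector per sentence: average the elements inside each subseq.
    sentence_vec = pooling(input=words,
                           pooling_type=pooling.Avg(),
                           agg_level=AggregateLevel.TO_SEQUENCE)

    # One vector for the whole paragraph: keep only the last sentence vector.
    paragraph_vec = last_seq(input=sentence_vec,
                             agg_level=AggregateLevel.TO_NO_SEQUENCE)

    # Broadcast the paragraph vector back, one copy per sentence.
    sentence_context = expand(input=paragraph_vec,
                              expand_as=sentence_vec,
                              expand_level=ExpandLevel.FROM_NO_SEQUENCE)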
diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 91ba101edb65cd45bd5e37a0c6ad25e515593a81..66e0345c299730c113ffbdc8dd3c1fa32f872f3d 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -118,7 +118,7 @@ class Float16Transpiler: for var in self.block.vars.keys(): if var not in args: - self.block.remove_var(var) + self.block._remove_var(var) def _modify_feed_fetch(self): ''' @@ -165,7 +165,7 @@ class Float16Transpiler: dtype=core.VarDesc.VarType.FP16, shape=var.shape, persistable=var.persistable) - self.block.insert_op( + self.block._insert_op( i + 1, type="cast", inputs={"X": var}, @@ -188,7 +188,7 @@ class Float16Transpiler: persistable=var.persistable) find_op(var) var.op.rename_output(var_name, tmp_var_name) - self.block.insert_op( + self.block._insert_op( i, type="cast", inputs={"X": tmp_var}, @@ -253,4 +253,4 @@ class Float16Transpiler: # old var will be replaced by the fp16 var in program desc self.input_map[var.name] = fp16_var_name - self.block.remove_var(var.name) + self.block._remove_var(var.name) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt deleted file mode 100644 index a8bbb4eb8081420ae0bbaf761bd27303c0d043cb..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/CMakeLists.txt +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") -endif(APPLE) - - -set(inference_deps paddle_inference_api paddle_fluid_api) -if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) -endif() - -function(inference_api_test TARGET_NAME) - if (WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs ARGS) - cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) - cc_test(${TARGET_NAME} - SRCS ${TARGET_NAME}.cc - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - if(inference_test_ARGS) - set_tests_properties(${TARGET_NAME} - PROPERTIES DEPENDS "${inference_test_ARGS}") - endif() - endif(WITH_TESTING) -endfunction(inference_api_test) - -cc_library(paddle_inference_api - SRCS paddle_inference_api.cc paddle_inference_api_impl.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) - -cc_library(paddle_inference_api_shared SHARED - SRCS paddle_inference_api.cc paddle_inference_api_impl.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) - -cc_test(test_paddle_inference_api - SRCS test_paddle_inference_api.cc - DEPS paddle_inference_api) - -inference_api_test(test_paddle_inference_api_impl - ARGS test_word2vec test_image_classification) - -if(WITH_GPU AND TENSORRT_FOUND) -cc_library(paddle_inference_tensorrt_subgraph_engine - SRCS paddle_inference_api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api) - -inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec) -endif() - -if (WITH_ANAKIN) # only needed in CI - # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, - # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to - # compile the libinference_anakin_api.a and compile with anakin.so. - nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) - nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) - target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - target_link_libraries(inference_anakin_api anakin anakin_saber_common) - target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) - if (WITH_TESTING) - cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc - ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api) - target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - endif(WITH_TESTING) -endif() - -if(WITH_TESTING) - add_subdirectory(demo) -endif() diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt deleted file mode 100644 index ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -inference_api_test(simple_on_word2vec ARGS test_word2vec) - -option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF) -if(NOT WITH_INFERENCE_DEMO) - return() -endif() - -set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") -set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) - -function(inference_download_test_demo TARGET) - if (NOT WITH_TESTING) - return() - endif() - set(options "") - set(oneValueArgs URL) - set(multiValueArgs SRCS) - cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}") - message(STATUS "inference demo ${test_dir}") - - if(NOT EXISTS "${test_dir}") - message(STATUS "Download ${TARGET} model from ${tests_URL}") - execute_process(COMMAND bash -c "mkdir -p ${test_dir}") - execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}") - execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz") - endif() - - cc_test(${TARGET} SRCS "${tests_SRCS}" - DEPS paddle_inference_api paddle_fluid - ARGS --data=${test_dir}/data.txt - --modeldir=${test_dir}/model - --refer=${test_dir}/result.txt) -endfunction() - -# disable mobilenet test -#inference_download_test_demo(mobilenet_inference_demo -# SRCS vis_demo.cc -# URL ${URL_ROOT}mobilenet.tar.gz) -inference_download_test_demo(se_resnext50_inference_demo - SRCS vis_demo.cc - URL ${URL_ROOT}se_resnext50.tar.gz) -inference_download_test_demo(ocr_inference_demo - SRCS vis_demo.cc - URL ${URL_ROOT}ocr.tar.gz) diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md deleted file mode 100644 index f1d256660299a68dc5d9d73dbe4a401a0e7d9680..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/demo/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Infernce Demos - -Input data format: - -- Each line contains a single record -- Each record's format is - -``` -\t -``` - -Follow the C++ codes in `vis_demo.cc`. - -## MobileNet - -To execute the demo, simply run - -```sh -./mobilenet_inference_demo --modeldir --data -``` - -## SE-ResNeXt-50 - -To execute the demo, simply run - -```sh -./se_resnext50_inference_demo --modeldir --data -``` - -## OCR - -To execute the demo, simply run - -```sh -./ocr_inference_demo --modeldir --data -``` diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc deleted file mode 100644 index ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h" -#include - -namespace paddle { - -PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const AnakinConfig &config) { - CHECK(Init(config)); -} - -bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { - if (!(graph_.load(config.model_file))) { - return false; - } - graph_.ResetBatchSize("input_0", config.max_batch_size); - // optimization for graph - if (!(graph_.Optimize())) { - return false; - } - // construct executer - executor_.init(graph_); - return true; -} - -bool PaddleInferenceAnakinPredictor::Run( - const std::vector &inputs, - std::vector *output_data) { - for (const auto &input : inputs) { - if (input.dtype != PaddleDType::FLOAT32) { - LOG(ERROR) << "Only support float type inputs. " << input.name - << "'s type is not float"; - return false; - } - auto d_tensor_in_p = executor_.get_in(input.name); - float *d_data_p = d_tensor_in_p->mutable_data(); - if (cudaMemcpy(d_data_p, - static_cast(input.data.data()), - d_tensor_in_p->valid_size() * sizeof(float), - cudaMemcpyHostToDevice) != 0) { - LOG(ERROR) << "copy data from CPU to GPU error"; - return false; - } - } - - executor_.prediction(); - - if (output_data->empty()) { - LOG(ERROR) << "At least one output should be set with tensors' names."; - return false; - } - for (auto &output : *output_data) { - auto *tensor = executor_.get_out(output.name); - output.shape = tensor->shape(); - if (output.data.length() < tensor->valid_size() * sizeof(float)) { - output.data.Resize(tensor->valid_size() * sizeof(float)); - } - // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data(), - tensor->mutable_data(), - tensor->valid_size() * sizeof(float), - cudaMemcpyDeviceToHost) != 0) { - LOG(ERROR) << "copy data from GPU to CPU error"; - return false; - } - } - return true; -} - -anakin::Net - &PaddleInferenceAnakinPredictor::get_executer() { - return executor_; -} - -// the cloned new Predictor of anakin share the same net weights from original -// Predictor -std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { - VLOG(3) << "Anakin Predictor::clone"; - std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); - // construct executer from other graph - auto anakin_predictor_p = - dynamic_cast(cls.get()); - if (!anakin_predictor_p) { - LOG(ERROR) << "fail to call Init"; - return nullptr; - } - anakin_predictor_p->get_executer().init(graph_); - - return std::move(cls); -} - -// A factory to help create difference predictor. 
-template <> -std::unique_ptr -CreatePaddlePredictor( - const AnakinConfig &config) { - VLOG(3) << "Anakin Predictor create."; - std::unique_ptr x( - new PaddleInferenceAnakinPredictor(config)); - return x; -}; - -} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc deleted file mode 100644 index b1e5b875981e0142f6970cf6864b7b598743654b..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ /dev/null @@ -1,290 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/contrib/inference/paddle_inference_api_impl.h" - -namespace paddle { -namespace { - -// Timer for timer -class Timer { - public: - double start; - double startu; - void tic() { - struct timeval tp; - gettimeofday(&tp, NULL); - start = tp.tv_sec; - startu = tp.tv_usec; - } - double toc() { - struct timeval tp; - gettimeofday(&tp, NULL); - double used_time_ms = - (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0; - return used_time_ms; - } -}; - -template -std::string num2str(T a) { - std::stringstream istr; - istr << a; - return istr.str(); -} -} // namespace - -bool NativePaddlePredictor::Init( - std::shared_ptr parent_scope) { - VLOG(3) << "Predictor::init()"; - - if (config_.use_gpu) { - place_ = paddle::platform::CUDAPlace(config_.device); - } else { - place_ = paddle::platform::CPUPlace(); - } - if (parent_scope) { - scope_ = parent_scope; - sub_scope_ = &(parent_scope->NewScope()); - } else { - paddle::framework::InitDevices(false); - scope_.reset(new paddle::framework::Scope()); - } - - executor_.reset(new paddle::framework::Executor(place_)); - - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); - } else { - LOG(ERROR) << "fail to load inference model."; - return false; - } - - ctx_ = executor_->Prepare(*inference_program_, 0); - executor_->CreateVariables( - *inference_program_, sub_scope_ ? 
sub_scope_ : scope_.get(), 0); - - // Get the feed_target_names and fetch_target_names - feed_target_names_ = inference_program_->GetFeedTargetNames(); - fetch_target_names_ = inference_program_->GetFetchTargetNames(); - return true; -} - -NativePaddlePredictor::~NativePaddlePredictor() { - if (sub_scope_) { - PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!"); - scope_->DeleteScope(sub_scope_); - } -}; - -bool NativePaddlePredictor::Run(const std::vector &inputs, - std::vector *output_data) { - VLOG(3) << "Predictor::predict"; - Timer timer; - timer.tic(); - // set feed variable - std::map feed_targets; - std::vector feeds; - if (!SetFeed(inputs, &feeds)) { - LOG(ERROR) << "fail to set feed"; - return false; - } - for (size_t i = 0; i < feed_target_names_.size(); ++i) { - VLOG(4) << "setting " << i << "-th target"; - feed_targets[feed_target_names_[i]] = &feeds[i]; - } - // get fetch variable - std::map fetch_targets; - std::vector fetchs; - fetchs.resize(fetch_target_names_.size()); - for (size_t i = 0; i < fetch_target_names_.size(); ++i) { - fetch_targets[fetch_target_names_[i]] = &fetchs[i]; - } - // Run the inference program - // if share variables, we need not create variables - VLOG(4) << "Run prepared context"; - executor_->RunPreparedContext( - ctx_.get(), - sub_scope_ != nullptr ? sub_scope_ : scope_.get(), - &feed_targets, - &fetch_targets, - false /* don't create variable eatch time */); - VLOG(4) << "Finish prepared context"; - if (!GetFetch(fetchs, output_data)) { - LOG(ERROR) << "fail to get fetches"; - return false; - } - VLOG(3) << "predict cost: " << timer.toc() << "ms"; - return true; -} - -std::unique_ptr NativePaddlePredictor::Clone() { - VLOG(3) << "Predictor::clone"; - std::unique_ptr cls(new NativePaddlePredictor(config_)); - - if (!dynamic_cast(cls.get())->Init(scope_)) { - LOG(ERROR) << "fail to call Init"; - return nullptr; - } - // fix manylinux compile error. - return std::move(cls); -} - -bool NativePaddlePredictor::SetFeed(const std::vector &inputs, - std::vector *feeds) { - VLOG(3) << "Predictor::set_feed"; - if (inputs.size() != feed_target_names_.size()) { - LOG(ERROR) << "wrong feed input size."; - return false; - } - for (size_t i = 0; i < feed_target_names_.size(); ++i) { - framework::LoDTensor input; - framework::DDim ddim = framework::make_ddim(inputs[i].shape); - void *input_ptr; - if (inputs[i].dtype == PaddleDType::INT64) { - input_ptr = input.mutable_data(ddim, platform::CPUPlace()); - } else if (inputs[i].dtype == PaddleDType::FLOAT32) { - input_ptr = input.mutable_data(ddim, platform::CPUPlace()); - } else { - LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; - return false; - } - - // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. - std::memcpy(static_cast(input_ptr), - inputs[i].data.data(), - inputs[i].data.length()); - feeds->push_back(input); - } - return true; -} - -bool NativePaddlePredictor::GetFetch( - const std::vector &fetchs, - std::vector *outputs) { - VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs.size()); - for (size_t i = 0; i < fetchs.size(); ++i) { - // TODO(panyx0718): Support fetch of other types. 
- if (fetchs[i].type() != typeid(float)) { - LOG(ERROR) << "only support fetching float now."; - return false; - } - std::vector shape; - auto dims_i = fetchs[i].dims(); - auto lod = fetchs[i].lod(); - const float *output_ptr = fetchs[i].data(); - // const int64_t* output_ptr = fetchs[i].data(); - auto num = fetchs[i].numel(); - std::vector data; - if (0 == lod.size()) { - std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); - for (int j = 0; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } else { - // for batch detection - // image[0] -> output[0] shape {145, 6} - // image[1] -> output[1] shape {176, 6} - // then, - // the batch output shape {321, 6} - // the lod {{0, 145, 321}} - // so we should append output[0] to {176, 6} - size_t max_dim = 0; - for (size_t j = 1; j < lod[0].size(); j++) { - max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); - } - size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back(); - if (max_dim > 0) { - data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); - } - for (size_t j = 1; j < lod[0].size(); j++) { - size_t start = lod[0][j - 1] * common_dim; - size_t end = lod[0][j] * common_dim; - if (end > start) { - std::copy(output_ptr + start, - output_ptr + end, - data.begin() + (j - 1) * max_dim * common_dim); - } - } - shape.push_back(lod[0].size() - 1); - shape.push_back(max_dim); - for (int j = 1; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } - - outputs->at(i).shape = shape; - auto &buffer = outputs->at(i).data; - if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) { - buffer.Resize(sizeof(float) * data.size()); - } - std::memcpy(buffer.data(), data.data(), buffer.length()); - outputs->at(i).dtype = PaddleDType::FLOAT32; - // TODO(panyx0718): support other types? fill tensor name? avoid a copy. - } - return true; -} - -template <> -std::unique_ptr -CreatePaddlePredictor( - const NativeConfig &config) { - VLOG(3) << "create NativePaddlePredictor"; - if (config.use_gpu) { - // 1. GPU memeroy - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, - 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); - std::vector flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { - flags.push_back("dummpy"); - std::string flag = "--fraction_of_gpu_memory_to_use=" + - num2str(config.fraction_of_gpu_memory); - flags.push_back(flag); - VLOG(3) << "set flag: " << flag; - framework::InitGflags(flags); - } - } - - std::unique_ptr predictor(new NativePaddlePredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { - return nullptr; - } - return std::move(predictor); -} - -} // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc deleted file mode 100644 index b100630dbe412ca811f1a8f2b8191356f5ebec2f..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/contrib/inference/paddle_inference_api.h" - -namespace paddle { - -DEFINE_string(dirname, "", "Directory of the inference model."); - -void Main(bool use_gpu) { - //# 1. Create PaddlePredictor with a config. - TensorRTConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; - config.use_gpu = use_gpu; - config.fraction_of_gpu_memory = 0.15; - config.device = 0; - auto predictor = - CreatePaddlePredictor(config); - - for (int batch_id = 0; batch_id < 3; batch_id++) { - //# 2. Prepare input. - int64_t data[4] = {1, 2, 3, 4}; - - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; - - // For simplicity, we set all the slots with the same data. - std::vector slots(4, tensor); - - //# 3. Run - std::vector outputs; - CHECK(predictor->Run(slots, &outputs)); - - //# 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length(); - const size_t num_elements = outputs.front().data.length() / sizeof(float); - // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; - } - } -} - -TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } - -} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec new file mode 100644 index 0000000000000000000000000000000000000000..e362d3486487dd0b55e3e40d1c1358f2e5604ac5 --- /dev/null +++ b/paddle/fluid/API.spec @@ -0,0 +1,428 @@ +paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Program.copy_data_info_from ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.prune ArgSpec(args=['self', 'targets'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.rollback ArgSpec(args=['self'], 
varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) +paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) +paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) 
+paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) +paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.train ArgSpec(args=['self', 'num_epochs', 'event_handler', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.BeginEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) +paddle.fluid.EndEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) +paddle.fluid.BeginStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id'], varargs=None, keywords=None, defaults=None) +paddle.fluid.EndStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id', 'metrics'], varargs=None, keywords=None, defaults=None) +paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', 'max_num_checkpoints', 'epoch_interval', 'step_interval'], varargs=None, keywords=None, defaults=(None, 3, 1, 10)) +paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) +paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) +paddle.fluid.InferenceTranspiler.__init__ +paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DistributeTranspilerConfig.__init__ +paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, 
None, None, 1, 0, None)) +paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None +paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) +paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) +paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) +paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) +paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) +paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)) +paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) +paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], 
varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None)) +paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) +paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) +paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) +paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None)) +paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid')) +paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) +paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) +paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) +paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 
'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) +paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False)) +paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) +paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) +paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) +paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_max ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) +paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, 
keywords=None, defaults=(None,)) +paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)) +paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) +paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) +paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) +paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) +paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)) +paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, 
None)) +paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) +paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) +paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) +paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR')) +paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)) +paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)) +paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) +paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) +paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) 
+paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) +paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)) +paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)) +paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) +paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.fill_constant_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)) +paddle.fluid.layers.fill_constant ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.argmin ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.argmax ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) 
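Each entry above is a one-line record of a public signature. As an illustration of how a few of them compose, here is a minimal sketch that builds a small classification loss from layers whose ArgSpecs appear in this file; the variable names, shapes, and reliance on the default main program are illustrative assumptions, not part of the spec:

    import paddle.fluid as fluid

    # data(name, shape, append_batch_size=True, dtype='float32', ...)
    x = fluid.layers.data(name='x', shape=[784], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')

    # create_parameter(shape, dtype, name=None, attr=None, is_bias=False, ...)
    w = fluid.layers.create_parameter(shape=[784, 10], dtype='float32')

    # matmul(x, y, transpose_x=False, transpose_y=False, name=None)
    logits = fluid.layers.matmul(x, w)

    # softmax_with_cross_entropy(logits, label, soft_label=False, ignore_index=-100)
    loss = fluid.layers.softmax_with_cross_entropy(logits=logits, label=y)
    avg_loss = fluid.layers.mean(loss)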
+paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)) +paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) +paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) +paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) +paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')) +paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) +paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, 
keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.read_input ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.write_output ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) +paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) +paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.scale ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_add ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_div ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_sub ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_max ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_min ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', 
defaults=None) +paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.exp ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.tanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.tanh_shrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sqrt ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.abs ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.ceil ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.floor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.cos ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sin ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.round ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.reciprocal ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.square ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.softplus ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.softsign ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.brelu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.leaky_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.soft_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) +paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 
'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) +paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) +paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) +paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)) +paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) +paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) +paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) +paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) +paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, 
keywords=None, defaults=(False,)) +paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)) +paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')) +paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) +paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)) +paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) 
+paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) +paddle.fluid.transpiler.InferenceTranspiler.__init__ +paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ +paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False)) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) +paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) +paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None) 
+paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,)) +paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,)) +paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08)) +paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08)) +paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06)) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) +paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False)) +paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000)) +paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 
'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) +paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) +paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) +paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool +paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None +paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] +paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None +paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None +paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None +paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None +paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)) +paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,)) +paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)) +paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeeder.feed_parallel ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None) +paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)) +paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None) +paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) +paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None) +paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) 
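The LoDTensor and place entries above are pybind signatures rather than Python ArgSpecs. A short sketch of how they fit together for a batch of two variable-length sequences (the shapes and lengths are only illustrative):

    import numpy as np
    import paddle.fluid as fluid

    t = fluid.LoDTensor()
    # set(ndarray, place): copies the numpy data onto the given place
    t.set(np.random.rand(6, 3).astype('float32'), fluid.CPUPlace())
    # two sequences of length 2 and 4, packed along dimension 0 (2 + 4 == 6)
    t.set_recursive_sequence_lengths([[2, 4]])

    print(t.shape())                                 # [6, 3]
    print(t.recursive_sequence_lengths())            # [[2, 4]]
    print(t.has_valid_recursive_sequence_lengths())  # True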
+paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) +paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) +paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None +paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None +paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope +paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d274d96c29bdbf5973d568d783369c3975bdc436..ee1f655e25dedb8846bb26275072fd9f6c1f123e 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -2,8 +2,14 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) -add_subdirectory(pybind) add_subdirectory(string) + +if (NOT WIN32) +add_subdirectory(pybind) add_subdirectory(recordio) -# NOTE: please add subdirectory inference at last. -add_subdirectory(inference) +endif(NOT WIN32) + +if(WITH_INFERENCE) + # NOTE: please add subdirectory inference at last. + add_subdirectory(inference) +endif() diff --git a/paddle/fluid/framework/.gitignore b/paddle/fluid/framework/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5132131e55e2feee8ae88b4c65ec102fbc9c5fe1 --- /dev/null +++ b/paddle/fluid/framework/.gitignore @@ -0,0 +1,2 @@ +.tensor_util.cu +.data_type_transform.cu \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 397c9f739452e5130dad28a763b92cf76720ec61..d998109df21f585bc4905e00e59fe07247fd3f5e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,26 @@ +# Windows treats a symbolic file as a real file, which is different from Unix. +# We create a hidden file and compile it instead of the original source file.
+function(windows_symbolic TARGET) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(src ${windows_symbolic_SRCS}) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.") + endif() + add_custom_command(OUTPUT .${src}.cu + COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + endforeach() +endfunction() + +add_subdirectory(ir) +if (NOT WIN32) add_subdirectory(details) +endif (NOT WIN32) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -6,10 +28,17 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) +cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type) + if (WIN32) + windows_symbolic(tensor_util SRCS tensor_util.cu) + nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) + add_dependencies(tensor tensor_util) + else() + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -21,12 +50,22 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) +if(WITH_GPU) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) +else() + cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +endif() +if (NOT WIN32) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) +else() +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) +endif (NOT WIN32) + cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) +cc_test(reader_test SRCS reader_test.cc DEPS reader) cc_test(variable_test SRCS variable_test.cc) @@ -41,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry device_context math_function) if(WITH_GPU) - nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + if (WIN32) + windows_symbolic(hidden_file SRCS data_type_transform.cu) + nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) + add_dependencies(data_type_transform hidden_file) + else() + nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) else() cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) @@ -61,14 +106,26 @@ cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) + +if (NOT WIN32) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) +else() +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog + shape_inference data_transform lod_tensor) +endif(NOT WIN32) + cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) + +cc_library(version SRCS version.cc) +cc_test(version_test SRCS version_test.cc DEPS version) + +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +if (NOT WIN32) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -78,21 +135,27 @@ add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) endif() - -cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) +if (NOT WIN32) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fast_threaded_ssa_graph_executor) +endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) @@ -103,10 +166,14 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) - + # cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) +if (NOT WIN32) +cc_test(rw_lock_test SRCS rw_lock_test.cc) +endif (NOT WIN32) + # disable test temporarily. # TODO https://github.com/PaddlePaddle/Paddle/issues/11971 # cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h new file mode 100644 index 0000000000000000000000000000000000000000..be9efcd74924a2050a2fd9ab83059590a1a2a2fd --- /dev/null +++ b/paddle/fluid/framework/array.h @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { +template +class Array { + static_assert(N > 0, "The size of array must be larger than 0"); + + public: + HOSTDEVICE Array() {} + + HOSTDEVICE explicit Array(const T &val) { + for (size_t i = 0; i < N; ++i) data_[i] = val; + } + + HOSTDEVICE const T *Get() const { return data_; } + + HOSTDEVICE T *GetMutable() { return data_; } + + HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + + HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + + HOSTDEVICE constexpr size_t size() const { return N; } + + private: + T data_[N]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 8428bf8e3392f68c9d1e2553f4d017cb620bb9f3..14ca3e96209ed17f12e87fda8506806514698977 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -128,7 +128,8 @@ struct ExtractAttribute { attr_value = &boost::get(attr); } catch (boost::bad_get& bad_get) { PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", - attr_name_, typeid(T).name(), attr.type().name()); + attr_name_, paddle::platform::demangle(typeid(T).name()), + paddle::platform::demangle(attr.type().name())); } return attr_value; } @@ -160,7 +161,7 @@ struct ExtractAttribute { attr_value = &boost::get(attr); } catch (boost::bad_get& bad_get) { PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", - attr_name_, attr.type().name()); + attr_name_, paddle::platform::demangle(attr.type().name())); } return attr_value; } @@ -186,7 +187,7 @@ struct ExtractAttribute { attr_value = &boost::get(attr); } catch (boost::bad_get& bad_get) { PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", - attr_name_, attr.type().name()); + attr_name_, paddle::platform::demangle(attr.type().name())); } return attr_value; } diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index ce48548418478cc5c9f9ca1244df9e66dca884e6..960ca39e1eadd3c064beb0e2c1342a406c4f0b6a 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -88,9 +88,8 @@ class BlockDesc { OpDesc *InsertOp(size_t index); /* - * Remove Op and its input/output variables. - * Note that for either input or output variable, if it is also an input or - * output variable of other ops, we should remain it. 
+ * Only remove op itself, + * do nothing to its input and output variables */ void RemoveOp(size_t s, size_t e); diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index cd00b7de7338982308acfa1f1e8c38e010c6a43b..c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -46,7 +46,7 @@ struct CastDataLayout { const std::vector axis_; template - void operator()() { + void apply() { auto place = ctx_->GetPlace(); if (platform::is_cpu_place(place)) { diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 60382faffb8e53870658b2d1ff83abc4008cb4cf..28f3da88fa18021f6b71e458fdb467be86d4dbf0 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -17,6 +17,8 @@ #include #include +using float16 = paddle::platform::float16; + namespace paddle { namespace framework { @@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() { RegisterType(retv, proto_type, #cc_type) // NOTE: Add your customize type here. - RegType(platform::float16, proto::VarType::FP16); + RegType(float16, proto::VarType::FP16); RegType(float, proto::VarType::FP32); RegType(double, proto::VarType::FP64); RegType(int, proto::VarType::INT32); @@ -62,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() { RegType(size_t, proto::VarType::SIZE_T); RegType(int16_t, proto::VarType::INT16); RegType(uint8_t, proto::VarType::UINT8); + RegType(int8_t, proto::VarType::INT8); #undef RegType return retv; diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 491413db8c8d66fd907801131e89d9303bdef9f2..8ad2fb5f3ffd9641932bbbb024a31e81d31dc9bb 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -30,28 +30,31 @@ template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { switch (type) { case proto::VarType::FP16: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::FP32: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::FP64: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::INT32: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::INT64: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::BOOL: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::UINT8: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::INT16: - visitor.template operator()(); + visitor.template apply(); + break; + case proto::VarType::INT8: + visitor.template apply(); break; default: PADDLE_THROW("Not supported %d", type); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..54c41c55ba63c0b2001cfcb6a9e94fbb0036d437 --- /dev/null +++ b/paddle/fluid/framework/data_type_test.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/data_type.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" + +TEST(DataType, float16) { + using paddle::framework::Tensor; + using paddle::platform::CPUPlace; + using paddle::platform::float16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::FP16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + + // test fp16 tensor + EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + + // test fp16 size + EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + + // test debug info + std::string type = "float16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 5a57ec20585c26dbcd4251464718fc819148a7a5..d79f8cacb5f4727defc77380371e57bcea65f068 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -37,7 +37,7 @@ struct CastDataType { const platform::DeviceContext* ctx_; template - void operator()() { + void apply() { auto* in_begin = in_.data(); auto* in_end = in_begin + in_.numel(); auto* out_begin = out_->mutable_data(in_.place()); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8404bf4a3e12bdd33c063678d9288bbb24bd8aea..a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,14 +1,13 @@ -cc_library(var_handle SRCS var_handle.cc DEPS place) +cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) -cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) -cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) -cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) +cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) +cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) +cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -53,3 +52,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b cc_library(scope_buffered_ssa_graph_executor SRCS 
scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) +cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc + DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b335d3a0d364c916e19574de8d3ed89aaec7de41..7c5f5bd80a937bf1a1c891155764833d7b21c5c2 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -17,16 +17,21 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { #ifdef PADDLE_WITH_CUDA -AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(ctxs) { if (nccl_ctxs_) { for (auto &p : places_) { this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); @@ -34,12 +39,15 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, } } #else -AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif void AllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index fdd250b0d3eb166249271a95f7592b9fadee5265..f6ef3a1367b91b6abf8ce74a91f73056efd0f84e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -30,11 +30,11 @@ namespace details { struct AllReduceOpHandle : public OpHandleBase { #ifdef PADDLE_WITH_CUDA - AllReduceOpHandle(const std::vector &local_scopes, + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); #else - AllReduceOpHandle(const std::vector &local_scopes, + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); #endif std::string Name() const override; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf..4fdab5cd94358d08eac7f8b041bf16d09042f0bd 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -15,12 +15,15 @@ #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include 
"paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // The input and output may have dummy vars. diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 8036f756b6d6506684c109ab881d546f38176a10..fe4e733e43417977df324fde808f52b228a27d19 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -35,10 +35,13 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: #ifdef PADDLE_WITH_CUDA - BroadcastOpHandle(const std::vector &local_scopes, + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); @@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase { } } #else - BroadcastOpHandle(const std::vector &local_scopes, + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif std::string Name() const override; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index c6e923ef77ff03413eefe4f26457a5322747618e..1413f7bd9ac515ae7dceee62de8f3bc74e3a2efc 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -96,48 +96,61 @@ struct TestBroadcastOpHandle { } param_scopes_[input_scope_idx]->Var("input"); + std::unique_ptr n( + new ir::Node("node0", ir::Node::Type::kOperation)); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, + nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, + nccl_ctxs_.get())); #else - op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); + op_handle_.reset( + new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_)); #endif } - auto* in_var_handle = - new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]); + std::unique_ptr v( + new ir::Node("node1", ir::Node::Type::kVariable)); + auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", + gpu_list_[input_scope_idx]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); // add dummy var - vars_.emplace_back(new DummyVarHandle()); + + std::unique_ptr v2( + new ir::Node("node2", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(v2.get())); DummyVarHandle* dummy_var_handle = static_cast(vars_.back().get()); - dummy_var_handle->generated_op_ = nullptr; + dummy_var_handle->ClearGeneratedOp(); 
op_handle_->AddInput(dummy_var_handle); for (size_t j = 0; j < gpu_list_.size(); ++j) { if (!use_gpu_) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); } - VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]); + std::unique_ptr v3( + new ir::Node("node3", ir::Node::Type::kVariable)); + VarHandle* out_var_handle = + new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); } // add dummy var - vars_.emplace_back(new DummyVarHandle()); + std::unique_ptr v4( + new ir::Node("node4", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(v4.get())); DummyVarHandle* out_dummy_var_handle = static_cast(vars_.back().get()); - out_dummy_var_handle->generated_op_ = nullptr; + out_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddOutput(out_dummy_var_handle); } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b2e5399e2376a86c1cd310b29c768832665af87f..8714a42162bda3d5ad12e7925fe8cc4e693f51b1 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -21,6 +21,26 @@ namespace framework { namespace details { struct BuildStrategy { + // ParallelExecutor supports two ReduceStrategy modes, kAllReduce and + // kReduce, on both CPU and GPU. With kAllReduce, each thread optimizes all + // of its parameters independently; with kReduce, the optimization of the + // parameters is partitioned across threads. + // For example, for a model with 100 parameters running on four threads, + // kAllReduce has every thread optimize all 100 parameters, while kReduce + // has every thread optimize only 25 of them. + // Note that with kReduce on CPU training, the parameters are shared between + // threads, which saves memory. + // FIXME(zcd): The results of kAllReduce and kReduce may differ on GPU, + // because floating-point summation depends on the order of the operands: + // the result of `a+b+c+d` may differ from the result of `c+a+b+d`. + // On GPU both kAllReduce and kReduce are implemented with NCCL, so their + // results may not be identical. + // On CPU, set `FLAGS_cpu_deterministic=true` in the environment to fix the + // summation order so that kAllReduce and kReduce produce the same result.
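The FIXME in the comment above comes down to floating-point addition not being associative. A small, self-contained illustration (editor's example, not part of the patch) of how two summation orders can disagree:

// Editor's sketch: why different summation orders can produce different
// floating-point results, which is why kAllReduce and kReduce may not match
// bit-for-bit on GPU.
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> vals = {1e8f, 3.14f, -1e8f, 2.71f};
  float forward = 0.f, shuffled = 0.f;
  for (float v : vals) forward += v;   // a + b + c + d
  const int order[4] = {2, 0, 1, 3};   // c + a + b + d
  for (int i : order) shuffled += vals[i];
  std::printf("forward=%.6f shuffled=%.6f equal=%d\n", forward, shuffled,
              forward == shuffled);
  return 0;
}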
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 }; enum class GradientScaleStrategy { diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index df05bb06333d6b964f2f5434c3d43214e5d2cb7a..b6282debdb4eb6b1f29c39e54ac4f3e2296838da 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -19,9 +19,10 @@ namespace paddle { namespace framework { namespace details { -ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope, +ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place) - : op_(framework::OpRegistry::CreateOp(op_desc)), + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), place_(place) {} @@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() { bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { bool need_wait = - in_var && in_var->generated_op_ && - in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_]; + in_var && in_var->GeneratedOp() && + in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_]; return need_wait; } diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 401ebb7953bb5d6c81d1e5206598c4b0ee5904c8..9a330749ea97dd5774b49638e2442d24b26d7b70 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -30,8 +30,7 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(const OpDesc &op_desc, Scope *scope, - platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); std::string Name() const override; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 68896c8ac1bae7d4bfcfa79cc8ec5c26bf2d93ee..525d24322442ef4dd6e8c24212af61c908959b87 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -22,10 +22,10 @@ namespace details { #ifdef PADDLE_WITH_CUDA DataBalanceOpHandle::DataBalanceOpHandle( - const std::vector &local_scopes, + ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs) - : local_scopes_(local_scopes), places_(places) { + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { if (ctxs) { for (auto &p : places_) { this->dev_ctxes_[p] = ctxs->DevCtx(p); @@ -34,9 +34,9 @@ DataBalanceOpHandle::DataBalanceOpHandle( } #else DataBalanceOpHandle::DataBalanceOpHandle( - const std::vector &local_scopes, + ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif std::string DataBalanceOpHandle::Name() const { return "data balance"; } diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h index 76a407e3610e8bb48facf1f814779f4c23f92d98..0462fb6ec713eb977f420a9cb485c0273e782496 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -30,11 +30,11 @@ namespace details { struct DataBalanceOpHandle : public OpHandleBase { 
public: #ifdef PADDLE_WITH_CUDA - DataBalanceOpHandle(const std::vector &local_scopes, + DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); #else - DataBalanceOpHandle(const std::vector &local_scopes, + DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); #endif diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h new file mode 100644 index 0000000000000000000000000000000000000000..c97b364de1ecae21e97351196389615187932b5e --- /dev/null +++ b/paddle/fluid/framework/details/exception_holder.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +class ExceptionHolder { + public: + void Catch(std::exception_ptr eptr) { + try { + std::rethrow_exception(eptr); + } catch (platform::EOFException exp) { + Catch(exp); + } catch (platform::EnforceNotMet exp) { + Catch(exp); + } catch (...) { + LOG(FATAL) << "Unknown exception caught"; + } + } + + bool IsCaught() const { + std::lock_guard lock(mu_); + return exception_.get() != nullptr; + } + + void ReThrow() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + break; + case kEnforceNotMet: { + auto e = *static_cast(exception_.get()); + throw e; + } + case kEOF: { + auto e = *static_cast(exception_.get()); + throw e; + } + } + ClearImpl(); + } + + void Clear() { + std::lock_guard lock(mu_); + ClearImpl(); + } + + private: + void ClearImpl() { + exception_.reset(); + type_ = kNone; + } + + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. 
+ if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; + ExceptionType type_{kNone}; + + std::unique_ptr exception_; + mutable std::mutex mu_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 716d674fa29bad9321fc20979775c06f26bf4679..5183be878eb49cccc68603c3fdd8023be5578036 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -19,10 +19,13 @@ namespace framework { namespace details { struct ExecutionStrategy { + enum ExecutorType { kDefault = 0, kExperimental = 1 }; + size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; size_t num_iteration_per_drop_scope_{100}; + ExecutorType type_{kDefault}; }; } // namespace details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..7606f2bc06b2ecf07c5649eeae1a2d5587a8880c --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
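The ExceptionHolder added above implements a common pattern: worker threads catch whatever they throw into a shared, mutex-protected holder, and the coordinating thread rethrows it later. A simplified, std-only sketch of the same idea (SimpleExceptionHolder is illustrative only and keeps just the first exception, unlike the class above, which distinguishes EOF from enforce failures):

// Editor's sketch (simplified, std-only): catch on worker threads, rethrow on
// the caller -- the same pattern ExceptionHolder implements in the patch.
#include <exception>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>
#include <vector>

class SimpleExceptionHolder {
 public:
  void Catch(std::exception_ptr eptr) {
    std::lock_guard<std::mutex> lock(mu_);
    if (!eptr_) eptr_ = eptr;  // keep only the first exception
  }
  bool IsCaught() const {
    std::lock_guard<std::mutex> lock(mu_);
    return static_cast<bool>(eptr_);
  }
  void ReThrow() {
    std::lock_guard<std::mutex> lock(mu_);
    if (eptr_) std::rethrow_exception(eptr_);
  }

 private:
  mutable std::mutex mu_;
  std::exception_ptr eptr_;
};

int main() {
  SimpleExceptionHolder holder;
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([i, &holder] {
      try {
        if (i == 2) throw std::runtime_error("worker 2 failed");
      } catch (...) {
        holder.Catch(std::current_exception());
      }
    });
  }
  for (auto &t : workers) t.join();
  if (holder.IsCaught()) {
    try {
      holder.ReThrow();
    } catch (const std::exception &e) {
      std::cout << "caught on main thread: " << e.what() << "\n";
    }
  }
  return 0;
}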
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include +#include +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph) + : strategy_(strategy), + local_scopes_(local_scopes), + places_(places), + graph_(std::move(graph)), + pool_(strategy.num_threads_ + + 1), // add one more thread for generate op_deps + fetch_ctxs_(places) { + auto &ops = graph_->Get("ops"); + + for (auto &op : ops) { + int dep = static_cast(op->NotReadyInputSize()); + op_deps_.emplace(op.get(), dep); + if (dep == 0) { + bootstrap_ops_.emplace_back(op.get()); + } + } + + PrepareAtomicOpDeps(); +} + +FeedFetchList FastThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::unique_ptr>> + op_deps = atomic_op_deps_.get(); + PrepareAtomicOpDeps(); + + paddle::framework::FeedFetchList fetches; + fetches.resize(fetch_tensors.size()); + std::unordered_map> fetched_vars; + std::vector> fetch_nodes; + std::vector> fetch_ops; + + for (auto &fetch_var_name : fetch_tensors) { + for (auto &var_map : graph_->Get("vars")) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); + } + } + } + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), + "Cannot find fetched variable.(Perhaps the main_program " + "is not set to ParallelExecutor)"); + + auto &vars = fetched_var_it->second; + + fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i, + &local_scopes_); + fetch_ops.emplace_back(op); + + for (auto &p : places_) { + op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + } + + for (auto *var : vars) { + op->AddInput(var); + } + + (*op_deps)[op] = static_cast(op->NotReadyInputSize()); + } + + size_t num_complete = 0; + remaining_ = 0; + BlockingQueue complete_q; + for (auto op : bootstrap_ops_) { + RunOpAsync(op_deps.get(), op, &complete_q); + } + + while (num_complete != op_deps->size()) { + size_t num_comp = complete_q.Pop(); + if (num_comp == -1UL) { + int remaining = 0; + while (true) { + remaining = remaining_; + if (remaining == 0) { + break; + } + for (int i = 0; i < remaining; ++i) { + complete_q.Pop(); + } + } + exception_.ReThrow(); + } + num_complete += num_comp; + } + // Wait FetchOps. + if (!fetch_ops.empty()) { + fetch_ops.clear(); + } + return fetches; +} +void FastThreadedSSAGraphExecutor::RunOpAsync( + std::unordered_map> *op_deps, + OpHandleBase *op, BlockingQueue *complete_q) { + ++remaining_; + this->pool_.enqueue([=] { + OpHandleBase *op_to_run = op; + size_t complete = 0; + while (op_to_run != nullptr) { + try { + op_to_run->Run(strategy_.use_cuda_); + ++complete; + } catch (...) 
{ + exception_.Catch(std::current_exception()); + --remaining_; + complete_q->Push(-1UL); + return; + } + auto &outputs = op_to_run->Outputs(); + op_to_run = nullptr; + for (auto &output : outputs) { + for (auto &pending_op : output->PendingOps()) { + std::atomic &deps = op_deps->at(pending_op); + if (deps.fetch_sub(1) == 1) { // pending_op ready + if (op_to_run == nullptr) { + op_to_run = pending_op; + } else { + this->RunOpAsync(op_deps, pending_op, complete_q); + } + } + } + } + } + --remaining_; + complete_q->Push(complete); + }); +} +void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { + atomic_op_deps_ = pool_.enqueue([&] { + std::unordered_map> *op_deps = + new std::unordered_map>; + for (auto &pair : op_deps_) { + (*op_deps)[pair.first] = pair.second; + } + return std::unique_ptr< + std::unordered_map>>(op_deps); + }); +} + +const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..dad3a231cba6402f57ba654a9ac5fb520b9c8f04 --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
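The executor above schedules work by dependency counting: every op starts with a counter equal to its not-yet-ready inputs, ops whose counter is already zero are bootstrapped, and finishing an op decrements its consumers' counters so that any consumer reaching zero becomes runnable. A sequential, self-contained sketch of that scheme (the tiny graph and names are illustrative only; the real executor runs ops on a thread pool with atomic counters):

// Editor's sketch: dependency-counting scheduling, the core idea behind
// FastThreadedSSAGraphExecutor, reduced to a single-threaded loop.
#include <iostream>
#include <queue>
#include <string>
#include <vector>

struct Op {
  std::string name;
  std::vector<int> consumers;  // indices of ops that depend on this one
};

int main() {
  //   0 -> 2, 1 -> 2, 2 -> 3
  std::vector<Op> ops = {
      {"read_a", {2}}, {"read_b", {2}}, {"add", {3}}, {"fetch", {}}};
  std::vector<int> deps = {0, 0, 2, 1};  // not-ready input counts

  std::queue<int> ready;
  for (size_t i = 0; i < deps.size(); ++i)
    if (deps[i] == 0) ready.push(static_cast<int>(i));  // bootstrap ops

  while (!ready.empty()) {
    int cur = ready.front();
    ready.pop();
    std::cout << "run " << ops[cur].name << "\n";
    for (int next : ops[cur].consumers)
      if (--deps[next] == 0) ready.push(next);  // consumer became runnable
  }
  return 0;
}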
+ +#pragma once +#include +#include +#include "ThreadPool.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +class Scope; +namespace details { + +class OpHandleBase; +class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); + FeedFetchList Run(const std::vector &fetch_tensors) override; + const ir::Graph &Graph() const override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector places_; + std::unique_ptr graph_; + + std::unordered_map op_deps_; + std::vector bootstrap_ops_; + + ::ThreadPool pool_; + platform::DeviceContextPool fetch_ctxs_; + std::atomic remaining_; + + void RunOpAsync(std::unordered_map> *op_deps, + OpHandleBase *op, BlockingQueue *complete_q); + + void PrepareAtomicOpDeps(); + + std::future< + std::unique_ptr>>> + atomic_op_deps_; + ExceptionHolder exception_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index d646c944601e81477787740189d7ac60ae97fa80..fe18b2060c5cd7e157374da53c5a985f70545ab7 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -21,13 +21,16 @@ namespace paddle { namespace framework { namespace details { -FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset, +FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, std::vector *local_scopes) - : data_(data), offset_(offset), local_scopes_(local_scopes) {} + : OpHandleBase(node), + data_(data), + offset_(offset), + local_scopes_(local_scopes) {} FetchOpHandle::~FetchOpHandle() { for (auto *input_var : inputs_) { - input_var->pending_ops_.erase(this); + input_var->RemoveOutput(this, this->Node()); } } @@ -77,8 +80,8 @@ void FetchOpHandle::RunImpl() { void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) { auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place); for (auto *input : inputs_) { - if (input->generated_op_) { - input->generated_op_->RecordWaitEventOnCtx(cpu_ctx); + if (input->GeneratedOp()) { + input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx); } } } diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index e09bdd1d3338bb175c1ddae35b53f98197b68e9a..6ce42f92d7f1e81eeafd1eb5c28ce3564a5ffebc 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -28,7 +28,7 @@ namespace details { struct FetchOpHandle : public OpHandleBase { public: - FetchOpHandle(FeedFetchList *data, size_t offset, + FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, std::vector *local_scopes); ~FetchOpHandle(); diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h index 140fb5bb49a33146de974b6d79559b4cf15bdd7b..3f360c510a4fdc0caaeb15d862b217ef41b8ea6e 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -30,10 +30,12 @@ namespace 
details { struct FuseVarsOpHandle : public OpHandleBase { public: - FuseVarsOpHandle(Scope *local_scope, const platform::Place &place, + FuseVarsOpHandle(ir::Node *node, Scope *local_scope, + const platform::Place &place, const std::unordered_map &inputs_numel, const std::type_index &var_type) - : local_scope_(local_scope), + : OpHandleBase(node), + local_scope_(local_scope), place_(place), inputs_numel_(inputs_numel), type_(var_type) { diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 2be02304566cf5dbe348fa01fc4171990eafd158..9aae19fc73de4387186da47c55710c94d53f1b88 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -20,9 +20,10 @@ namespace paddle { namespace framework { namespace details { -GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, +GatherOpHandle::GatherOpHandle(ir::Node *node, + const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { if (places_.size() == 1) return; diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h index d11ef8556aa8840949ca8dc7aa176413f70b9f22..d9afbc6547e18e8886c414ff150e332cfaf9b0c3 100644 --- a/paddle/fluid/framework/details/gather_op_handle.h +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -30,7 +30,7 @@ namespace details { struct GatherOpHandle : public OpHandleBase { public: - GatherOpHandle(const std::vector &local_scopes, + GatherOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); std::string Name() const override; diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3cce2cc1640b3866130126424ff8fef18b8befc6..c9b94d1e1039df6ff27f9ffe225b2a50c35a5c50 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -70,6 +70,7 @@ struct TestGatherOpHandle { } void InitGatherOp(size_t input_scope_idx) { + std::vector> nodes; for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); Scope& local_scope = local_scopes_.back()->NewScope(); @@ -81,30 +82,37 @@ struct TestGatherOpHandle { } param_scopes_[input_scope_idx]->Var("out"); - op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_)); + nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation)); + op_handle_.reset( + new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); - auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]); + nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable)); + auto* in_var_handle = + new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); } // add dummy var - vars_.emplace_back(new DummyVarHandle()); + nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes.back().get())); DummyVarHandle* in_dummy_var_handle = static_cast(vars_.back().get()); - in_dummy_var_handle->generated_op_ = nullptr; + in_dummy_var_handle->ClearGeneratedOp(); 
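The updated tests above all follow the same ownership convention: the test keeps every ir::Node alive through a unique_ptr held in a local vector, while the op and variable handles only store non-owning raw pointers to those nodes. A stripped-down sketch of that convention with stand-in types (not Paddle's real classes):

// Editor's sketch: owning container for nodes, non-owning pointers in handles.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Node {
  explicit Node(std::string n) : name(std::move(n)) {}
  std::string name;
};

struct Handle {
  explicit Handle(Node *node) : node(node) {}  // non-owning pointer
  Node *node;
};

int main() {
  std::vector<std::unique_ptr<Node>> nodes;   // owns the nodes
  std::vector<std::unique_ptr<Handle>> vars;  // owns the handles

  nodes.emplace_back(new Node("node1"));
  vars.emplace_back(new Handle(nodes.back().get()));

  std::cout << "handle refers to " << vars.front()->node->name << "\n";
  return 0;
}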
op_handle_->AddInput(in_dummy_var_handle); // add output - auto* out_var_handle = - new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]); + nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable)); + auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx, + "out", gpu_list_[input_scope_idx]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); // add dummy var - vars_.emplace_back(new DummyVarHandle()); + nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes.back().get())); DummyVarHandle* dummy_var_handle = static_cast(vars_.back().get()); op_handle_->AddOutput(dummy_var_handle); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc deleted file mode 100644 index b82c2ef4082110f1621eb38d50361396511a4825..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ /dev/null @@ -1,602 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/all_reduce_op_handle.h" -#include "paddle/fluid/framework/details/broadcast_op_handle.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/data_balance_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/reduce_op_handle.h" -#include "paddle/fluid/framework/details/rpc_op_handle.h" -#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace details { - -#ifdef PADDLE_WITH_CUDA -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - nccl_ctxs_(nccl_ctxs), - strategy_(strategy) { -#else -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - strategy_(strategy) { -#endif - for (auto &p : params) { - grad_names_.insert(GradVarName(p)); - } - balance_vars_.resize(places_.size(), 0); - if (strategy_.enable_data_balance_ && places_.size() == 1) { - LOG(WARNING) << "It is no need to enable data balance when there is only " - "one place. 
enable_data_balance is set to False."; - strategy_.enable_data_balance_ = false; - } -} - -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, - const OpDesc &op, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->ops_.back().get(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (auto &each_var_name : op.InputArgumentNames()) { - VarHandle *var = - CreateOrGetLatestVarHandle(result, each_var_name, p, place_id); - op_handle->AddInput(var); - } - - for (auto &each_var_name : op.OutputArgumentNames()) { - CreateOpOutput(result, op_handle, each_var_name, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const ProgramDesc &program) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto *op : program.Block(0).AllOps()) { - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const ProgramDesc &program) const { - std::vector recv_vars; - for (auto *op : program.Block(0).AllOps()) { - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -bool MultiDevSSAGraphBuilder::IsDistTrainOp( - const OpDesc &op, const std::vector &send_vars, - const std::vector &recv_vars) const { - if (send_vars.size() == 0 || recv_vars.size() == 0) { - return false; - } - - /** - * Check any of opvars contains `.block` and in sendvars - */ - auto checker = [](const std::vector &opvars, - const std::vector &rpc_vars) -> bool { - for (auto &var : opvars) { - // a variable name with the suffix `.block` means it's a splited - // variable by (DistributeTranspiler) - // [python/paddle/fluid/transpiler/distribute_transpiler.py] - if (var.find(".block") != std::string::npos && - std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { - return true; - } - } - return false; - }; - - return checker(op.OutputArgumentNames(), send_vars) || - checker(op.InputArgumentNames(), recv_vars); -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -std::unique_ptr MultiDevSSAGraphBuilder::Build( - const ProgramDesc &program) const { - for (auto *var : program.Block(0).AllVars()) { - all_vars_.emplace(var->Name(), var); - } - - auto graph = new SSAGraph(); - SSAGraph &result = 
*graph; - std::unordered_set og_has_been_broadcast; - - // We cannot invoke resize. It is a bug of GCC 4.8 - result.vars_ = std::vector< - std::unordered_map>>>( - places_.size()); - - // find send/recv vars so that we can place the distributed training - // realted op in the place 0 - auto send_vars = FindDistTrainSendVars(program); - auto recv_vars = FindDistTrainRecvVars(program); - - std::vector> bcast_var_name_set; - bcast_var_name_set.resize(places_.size()); - - size_t cur_device_id = 0; - bool is_forwarding = true; - - for (auto *op : program.Block(0).AllOps()) { - if (boost::get( - op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, *op); - } else if (IsDistTrainOp(*op, send_vars, recv_vars)) { - CreateDistTrainOp(&result, *op); - } else if (IsScaleLossOp(*op)) { - // user can customize loss@grad if not use_default_grad_scale_ - if (strategy_.gradient_scale_ != - BuildStrategy::GradientScaleStrategy::kCustomized) { - CreateScaleLossGradOp(&result); - } - // This assumes the backward generating code will ensure IsScaleLossOp - // is true only for the op that scale the final scalar loss. - // It also assumes backward op will always follow the forward op in - // the block. - is_forwarding = false; - } else { - int op_dev_id = GetOpDeviceID(*op); - if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, *op, op_dev_id); - for (auto &var_name : op->OutputArgumentNames()) { - var_name_on_devices_.emplace(var_name, op_dev_id); - } - } else { - // This op runs on all devices, and its output may have parameter's - // gradients. - if (op->Type() == "read" && strategy_.enable_data_balance_) { - op->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, *op, places_.size()); - const auto &data_var_names = op->Output("Out"); - InsertDataBalanceOp(&result, data_var_names); - } else { - CreateComputationalOps(&result, *op, places_.size()); - } - - if (!is_forwarding && places_.size() > 1) { - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. 
- if (static_cast(boost::get(op->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = - boost::get>(op->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices_.emplace(g_name, cur_device_id); - bcast_var_name_set[cur_device_id].emplace(p_name); - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } - } - } catch (boost::bad_get e) { - } - } - } - } - } - } - - // Insert BCast Ops - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); - } - } - /* - Dependency graph has been constructed. However, there are still data - hazards need to be handled. - */ - PolishGraphToSupportDataHazards(&result); - - /* - * Only variables should be the leaves of graph. - */ - AddOutputToLeafOps(&result); - - return std::unique_ptr(graph); -} - -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; - } - return false; -} - -void MultiDevSSAGraphBuilder::SetCommunicationContext( - OpHandleBase *op_handle, const platform::Place &p) const { -#ifdef PADDLE_WITH_CUDA - if (nccl_ctxs_ == nullptr) { - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - } -#else - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif -} - -void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, - const std::string &p_name, - size_t src_dev_id) const { -#ifdef PADDLE_WITH_CUDA - auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_); -#else - auto *op_handle = new BroadcastOpHandle(local_scopes_, places_); -#endif - - result->ops_.emplace_back(op_handle); - auto *in = result->vars_.at(src_dev_id).at(p_name).back().get(); - op_handle->AddInput(in); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - auto &vars = result->vars_.at(i).at(p_name); - auto *out_var = new VarHandle(vars.size(), i, p_name, p); - vars.emplace_back(out_var); - op_handle->AddOutput(out_var); - } -} - -void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result, - const OpDesc &op, - int dev_id) const { - result->ops_.emplace_back( - new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id])); - CreateOpHandleIOs(result, op, dev_id); -} - -void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, - const std::string &og) const { -#ifdef PADDLE_WITH_CUDA - result->ops_.emplace_back( - new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); -#else - result->ops_.emplace_back(new 
AllReduceOpHandle(local_scopes_, places_)); -#endif - auto *op_handle = result->ops_.back().get(); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - auto &vars = result->vars_[i][og]; - PADDLE_ENFORCE(!vars.empty()); - auto &prev_grad = vars.back(); - op_handle->AddInput(prev_grad.get()); - - auto var = new VarHandle(vars.size(), i, og, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } -} - -void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - SSAGraph *result, const std::vector &datas) const { -#ifdef PADDLE_WITH_CUDA - result->ops_.emplace_back( - new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_)); -#else - result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_)); -#endif - auto *op_handle = result->ops_.back().get(); - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - for (const std::string &d_name : datas) { - auto &vars = result->vars_[i][d_name]; - PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back().get()); - auto var = new VarHandle(vars.size(), i, d_name, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } - } -} - -bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( - const std::string &og, - std::unordered_set *og_has_been_broadcast) const { - bool is_pg_once = - grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0; - if (is_pg_once) { - // Insert NCCL AllReduce Op - og_has_been_broadcast->insert(og); - } - return is_pg_once; -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - for (auto &varname : op.InputArgumentNames()) { - int dev_id = GetVarDeviceID(varname); - if (dev_id != -1) { - return dev_id; - } - } - return -1; -} - -int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { - auto got = var_name_on_devices_.find(varname); - return got == var_name_on_devices_.end() ? -1 : got->second; -} - -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { - for (size_t i = 0; i < places_.size(); ++i) { -// Insert ScaleCost OpHandle -#ifdef PADDLE_WITH_CUDA - auto *communication_dev_ctx = - nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i]) - : platform::DeviceContextPool::Instance().Get(places_[i]); -#else - auto *communication_dev_ctx = - platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); -#endif - - auto *op_handle = - new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i], - places_[i], communication_dev_ctx); - result->ops_.emplace_back(op_handle); - - // FIXME: Currently ScaleLossGradOp only use device_count as scale - // factor. So it does not depend on any other operators. 
- // VarHandle *loss = GetVarHandle(loss_var_name, place); - // loss->pending_ops_.emplace_back(op_handle); - // op_handle->inputs_.emplace_back(loss); - - CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i], - i); - } -} - -void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result, - const OpDesc &op, - size_t num_places) const { - for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { - auto p = places_[scope_idx]; - auto s = local_scopes_[scope_idx]; - result->ops_.emplace_back(new ComputationOpHandle(op, s, p)); - CreateOpHandleIOs(result, op, scope_idx); - } -} - -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, - const std::string &og, - int dst_dev_id) const { -#ifdef PADDLE_WITH_CUDA - result->ops_.emplace_back( - new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); -#else - result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_)); -#endif - auto *op_handle = result->ops_.back().get(); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - auto &vars = result->vars_[i][og]; - PADDLE_ENFORCE(!vars.empty()); - auto &prev_grad = vars.back(); - op_handle->AddInput(prev_grad.get()); - } - auto &vars = result->vars_[dst_dev_id][og]; - auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]); - vars.emplace_back(var); - op_handle->AddOutput(var); - return var; -} - -// Find the first occurence of `prev_op_name` and make current `op` depend -// on it. -void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, - const std::string &prev_op_name) const { - for (auto &prev_op : result->ops_) { - if (prev_op->Name() == prev_op_name) { - auto *dep_var = new DummyVarHandle(); - prev_op->AddOutput(dep_var); - result->dep_vars_.emplace(dep_var); - op->AddInput(dep_var); - } - } -} - -void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, - const OpDesc &op) const { - int op_dev_id = -1; - if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") { - op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); - for (auto &varname : op.InputArgumentNames()) { - var_name_on_devices_.emplace(varname, op_dev_id); - } - } - for (auto &varname : op.OutputArgumentNames()) { - var_name_on_devices_.emplace(varname, op_dev_id); - } - } else if (op.Type() == "concat") { - op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); - for (auto &varname : op.OutputArgumentNames()) { - var_name_on_devices_.emplace(varname, op_dev_id); - } - } else { - PADDLE_ENFORCE( - "the distribute training related op should be in [split_byref, " - "concat]."); - } - - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", op.Type()); - - CreateComputationalOp(result, op, op_dev_id); - if (op.Type() == "concat") { - ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); - } -} - -// Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, - const OpDesc &op) const { - int op_dev_id = -1; - if (op.Type() == "send") { - op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); - // the variable name which contains .block means it was splited by - // split_byref op - // so that we can balance the variable blocks to all the pserver - // instances. 
- if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && - op.InputArgumentNames()[0].find(".block") == std::string::npos) { - op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); - for (auto &varname : op.InputArgumentNames()) { - var_name_on_devices_.emplace(varname, op_dev_id); - } - } - } else if (op.Type() == "recv") { - op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames()); - for (auto &varname : op.OutputArgumentNames()) { - var_name_on_devices_.emplace(varname, op_dev_id); - } - } else { - // send_barrier and fetch_barrier op can be scheduled on device 0 - op_dev_id = 0; - } - - PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", - op.Type()); - - result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id], - op.Type(), places_[op_dev_id])); - - if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send"); - } else if (op.Type() == "recv") { - ConnectOp(result, result->ops_.back().get(), "send_barrier"); - } else if (op.Type() == "fetch_barrier") { - ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send") { - // do nothing - } else { - PADDLE_THROW( - "rpc op should be in [" - "send, send_barrier. recv, fetch_barrier]"); - } - - CreateOpHandleIOs(result, op, op_dev_id); -} - -bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { - return boost::get( - op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h deleted file mode 100644 index a964e024885e56693224a6199e00ff30beaa1df4..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" - -namespace paddle { -namespace platform { -class NCCLContextMap; -} - -namespace framework { -class Scope; -namespace details { - -class MultiDevSSAGraphBuilder : public SSAGraphBuilder { - public: -#ifdef PADDLE_WITH_CUDA - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, - const BuildStrategy &strategy); -#else - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - const BuildStrategy &strategy); -#endif - - std::unique_ptr Build(const ProgramDesc &program) const override; - int GetVarDeviceID(const std::string &varname) const override; - - private: - void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - size_t device_id) const; - - private: - std::string loss_var_name_; - const std::vector &places_; - const std::vector &local_scopes_; - std::unordered_set grad_names_; - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nccl_ctxs_; -#endif - - bool IsScaleLossOp(const OpDesc &op) const; - - void CreateRPCOp(SSAGraph *result, const OpDesc &op) const; - void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const; - - /** - * Is this operator as the end-point operator before/after send operator. - */ - bool IsDistTrainOp(const OpDesc &op, - const std::vector &send_vars, - const std::vector &recv_vars) const; - - std::vector FindDistTrainSendVars( - const ProgramDesc &program) const; - - std::vector FindDistTrainRecvVars( - const ProgramDesc &program) const; - - void ConnectOp(SSAGraph *result, OpHandleBase *op, - const std::string &prev_op_name) const; - - void CreateComputationalOps(SSAGraph *result, const OpDesc &op, - size_t num_places) const; - - void CreateScaleLossGradOp(SSAGraph *result) const; - VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og, - int dst_dev_id) const; - void CreateComputationalOp(SSAGraph *result, const OpDesc &op, - int dev_id) const; - - bool IsParameterGradientOnce( - const std::string &og, - std::unordered_set *og_has_been_broadcast) const; - - int GetOpDeviceID(const OpDesc &op) const; - - void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; - - void InsertDataBalanceOp(SSAGraph *result, - const std::vector &datas) const; - - void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, - size_t src_dev_id) const; - - bool IsSparseGradient(const std::string &og) const; - - size_t GetAppropriateDeviceID( - const std::vector &var_names) const; - - private: - BuildStrategy strategy_; - mutable std::unordered_map all_vars_; - mutable std::unordered_map var_name_on_devices_; - mutable std::vector balance_vars_; - - void SetCommunicationContext(OpHandleBase *op_handle, - const platform::Place &p) const; -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc similarity index 70% rename from paddle/fluid/framework/details/ssa_graph_checker.cc rename to paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index da5428946ee588e8eac1f78929dc0432df532975..c9c255864a2477ed29873f8521acce37fa928c06 100644 --- 
a/paddle/fluid/framework/details/ssa_graph_checker.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include -#include "paddle/fluid/framework/details/ssa_graph_checker.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { +bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { std::unordered_map pending_ops; std::unordered_set pending_vars; std::unordered_set ready_vars; @@ -28,12 +28,12 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { auto insert_pending_var = [&](VarHandleBase *var) { pending_vars.insert(var); - if (var->generated_op_ == nullptr) { + if (var->GeneratedOp() == nullptr) { ready_vars.emplace(var); } }; - for (auto &var_map : graph->vars_) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { insert_pending_var(version_pair.get()); @@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { } } - for (auto &var : graph->dep_vars_) { + for (auto &var : graph->Get(kGraphDepVars)) { insert_pending_var(var.get()); } - for (auto &op : graph->ops_) { + for (auto &op : graph->Get(kGraphOps)) { if (op->Inputs().empty()) { ready_ops.insert(op.get()); } else { @@ -71,7 +71,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { + for (auto *op : ready_var->PendingOps()) { auto &deps = --pending_ops[op]; if (deps == 0) { ready_ops.insert(op); @@ -85,3 +85,10 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_devices_check_pass, + paddle::framework::details::SSAGraghBuilderWithChecker) + .RequireGraphAttr(paddle::framework::details::kGraphVars) + .RequireGraphAttr(paddle::framework::details::kGraphDepVars) + .RequireGraphAttr(paddle::framework::details::kGraphOps) + .RequireGraphAttr(paddle::framework::details::kShardedVarDevice); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h similarity index 58% rename from paddle/fluid/framework/details/ssa_graph_checker.h rename to paddle/fluid/framework/details/multi_devices_graph_check_pass.h index 331aa9d2b5864c470dbd5e29ef6faccffdcf781c..1e2b1867c376956d7d2dac465c13e2f3f64ba7eb 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h @@ -14,35 +14,23 @@ #pragma once -#include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include namespace paddle { namespace framework { namespace details { -struct SSAGraph; -class SSAGraghBuilderWithChecker : public SSAGraphBuilder { - public: - explicit SSAGraghBuilderWithChecker( - std::unique_ptr&& builder) - : builder_(std::move(builder)) {} - - std::unique_ptr Build(const ProgramDesc& program) const override { - auto graph = builder_->Build(program); 
+class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { PADDLE_ENFORCE(IsValidGraph(graph.get())); return graph; } - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); - } - - bool IsValidGraph(const SSAGraph* graph) const; - - private: - std::unique_ptr builder_; + bool IsValidGraph(const ir::Graph* graph) const; }; } // namespace details diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..250e093a5f789dba6b06df4889c060c294d469fe --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -0,0 +1,872 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/data_balance_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace details { +namespace { +void PolishGraphToSupportDataHazards(ir::Graph *graph) { + for (auto &var_map : graph->Get(kGraphVars)) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + continue; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. 
+ continue; + } + bool has_dep = false; + for (auto *r_out : read_op->Outputs()) { + for (auto *w_in : write_op->Inputs()) { + if (r_out->Node() == w_in->Node()) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + } + } + } +} + +VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, + const platform::Place &place, + size_t place_offset) { + auto &var_holders = graph->Get(kGraphVars)[place_offset]; + auto &var_holder = var_holders[node->Name()]; + VarHandle *var = nullptr; + if (var_holder.empty()) { + if (node->Var()) { + var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset, + node->Name(), place); + } else { + var = new VarHandle( + graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0, + place_offset, node->Name(), place); + } + var_holder.emplace_back(var); + } else { + var = var_holder.rbegin()->get(); + } + return var; +} + +void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, + ir::Node *new_node, const platform::Place &place, + size_t place_offset) { + auto &vars = + graph->Get(kGraphVars)[place_offset][new_node->Name()]; + size_t version = vars.size(); + auto var = + new VarHandle(new_node, version, place_offset, new_node->Name(), place); + vars.emplace_back(var); + op_handle->AddOutput(var); +} + +void AddOutputToLeafOps(ir::Graph *graph) { + for (auto &op : graph->Get(kGraphOps)) { + if (!op->Outputs().empty()) { + continue; + } + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + op->AddOutput(dummy_leaf); + } +} +} // namespace + +static const char kLossVarName[] = "loss_var_name"; +static const char kPlaces[] = "places"; +static const char kParams[] = "params"; +static const char kLocalScopes[] = "local_scopes"; +static const char kStrategy[] = "strategy"; + +void MultiDevSSAGraphBuilder::Init() const { + loss_var_name_ = Get(kLossVarName); + places_ = Get>(kPlaces); + local_scopes_ = Get>(kLocalScopes); + strategy_ = Get(kStrategy); +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_ = &Get("nccl_ctxs"); +#endif + + for (auto &p : Get>(kParams)) { + grad_names_.insert(GradVarName(p)); + } + balance_vars_.resize(places_.size(), 0); + if (strategy_.enable_data_balance_ && places_.size() == 1) { + LOG(WARNING) << "It is no need to enable data balance when there is only " + "one place. 
enable_data_balance is set to False."; + strategy_.enable_data_balance_ = false; + } +} + +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back().get(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( + const std::vector &nodes) const { + std::vector send_vars; + // since parameters are all in block 0, + // it's enough to only scan send ops in block 0 + for (auto &node : nodes) { + OpDesc *op = node->Op(); + // TODO(Yancey1989): use a graceful method to find send op, + // instead of the hard-coded string + if (op->Type() == "send") { + auto op_vars = op->InputArgumentNames(); + send_vars.reserve(send_vars.size() + + std::distance(op_vars.begin(), op_vars.end())); + send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); + } + } + return send_vars; +} + +std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( + const std::vector &nodes) const { + std::vector recv_vars; + for (auto &node : nodes) { + OpDesc *op = node->Op(); + // TODO(Yancey1989): use a graceful method to find recv op, + // instead of the hard-coded string + if (op->Type() == "recv") { + auto op_vars = op->OutputArgumentNames(); + recv_vars.reserve(recv_vars.size() + + std::distance(op_vars.begin(), op_vars.end())); + recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); + } + } + return recv_vars; +} + +bool MultiDevSSAGraphBuilder::IsDistTrainOp( + ir::Node *node, const std::vector &send_vars, + const std::vector &recv_vars) const { + if (send_vars.size() == 0 || recv_vars.size() == 0) { + return false; + } + + /** + * Check whether any of opvars contains `.block` and is in sendvars + */ + auto checker = [](const std::vector &opvars, + const std::vector &rpc_vars) -> bool { + for (auto &var : opvars) { + // a variable name with the suffix `.block` means it's a variable + // split by the (DistributeTranspiler) + // [python/paddle/fluid/transpiler/distribute_transpiler.py] + if (var.find(".block") != std::string::npos && + std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { + return true; + } + } + return false; + }; + + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + return checker(output_var_names, send_vars) || + checker(input_var_names, recv_vars); +} + +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + 
auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +// Topology sort the graph nodes from inputs to outputs. +// Since SSAGraphBuilder depends on forward/backward nodes to assign devices +// to parameter/gradients before optimizer ops, topo sort is insufficient. ( +// some optimizer ops might not depend on any nodes), we manually move all +// optimizer nodes after last backward nodes. +// However, the assumption by SSAGraphBuilder should be relaxed in the future. +std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { + std::vector ret = ir::TopologySortOperations(graph); + size_t last_backward = 0; + for (size_t i = 0; i < ret.size(); ++i) { + if (boost::get( + ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(OpRole::kBackward)) { + last_backward = i; + } + } + + std::vector optimize_ops; + std::vector sorted_ret; + for (size_t i = 0; i < ret.size(); ++i) { + if (i < last_backward) { + if (boost::get(ret[i]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(OpRole::kOptimize)) { + optimize_ops.push_back(ret[i]); + } else { + sorted_ret.push_back(ret[i]); + } + } else if (i == last_backward) { + sorted_ret.push_back(ret[i]); + // Verify that no operations before optimize ops depends on optimize ops. + std::unordered_set optimize_set(optimize_ops.begin(), + optimize_ops.end()); + for (ir::Node *n : sorted_ret) { + for (ir::Node *in : n->inputs) { + for (ir::Node *pre_n : in->inputs) { + PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), + "optimize operations cannot be depended by forward " + "or backward node %s -> %s", + pre_n->Name(), n->Name()); + } + } + } + sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), + optimize_ops.end()); + } else { + sorted_ret.push_back(ret[i]); + } + } + return sorted_ret; +} + +std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( + std::unique_ptr graph) const { + Init(); + // Give the topology sort order and rebuild the graph structure. + std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + auto nodes = graph->ReleaseNodes(); + ir::Graph &result = *graph; + + for (auto &node : nodes) { + if (node->IsVar() && node->Var()) { + all_vars_.emplace(node->Name(), node->Var()); + } + } + std::unordered_set og_has_been_broadcast; + + // We cannot invoke resize. 
It is a bug of GCC 4.8 + result.Set(kGraphVars, new GraphVars(places_.size())); + result.Set(kGraphDepVars, new GraphDepVars); + result.Set(kGraphOps, new GraphOps); + result.Set(kShardedVarDevice, new ShardedVarDevice); + + // find send/recv vars so that we can place the distributed training + // related op in the place 0 + auto send_vars = FindDistTrainSendVars(sorted_ops); + auto recv_vars = FindDistTrainRecvVars(sorted_ops); + + std::vector> bcast_var_name_set; + bcast_var_name_set.resize(places_.size()); + + size_t cur_device_id = 0; + bool is_forwarding = true; + bool is_dist_train = false; + + for (ir::Node *node : sorted_ops) { + if (boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(&result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); + } + } + is_dist_train = true; + } else if (IsDistTrainOp(node, send_vars, recv_vars)) { + int op_dev_id = CreateDistTrainOp(&result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set[op_dev_id].emplace(origin_param_name); + } + } else if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + if (strategy_.gradient_scale_ != + BuildStrategy::GradientScaleStrategy::kCustomized) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + CreateScaleLossGradOp(&result, loss_grad_name); + } + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. + is_forwarding = false; + } else { + int op_dev_id = GetOpDeviceID(result, node); + if (op_dev_id != -1) { // This op only runs on one specific device. + CreateComputationalOp(&result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + graph->Get(kShardedVarDevice) + .emplace(n->Name(), op_dev_id); + } + } else { + // This op runs on all devices, and its output may have parameter's + // gradients. + // TODO(paddle-dev): Why is so special about "read" op? + if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { + node->Op()->SetAttr("throw_eof_exp", false); + CreateComputationalOps(&result, node, places_.size()); + const auto &data_var_names = node->Op()->Output("Out"); + InsertDataBalanceOp(&result, data_var_names); + } else { + CreateComputationalOps(&result, node, places_.size()); + } + + if (!is_forwarding && places_.size() > 1) { + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. 
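+ // Only backward ops carry the (parameter, gradient) name pairs in their op-role-var attribute.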
+ if (static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward))) { + try { + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + graph->Get(kShardedVarDevice) + .emplace(g_name, cur_device_id); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; + } + } + } catch (boost::bad_get e) { + } + } + } + } + } + } + bool use_gpu = false; +#ifdef PADDLE_WITH_CUDA + use_gpu = nccl_ctxs_ != nullptr; +#endif + + // Principles for inserting broadcast operators: + // 1. Broadcast optimized parameters in Reduce strategy; + // 2. No need to broadcast optimized parameters in AllReduce strategy because + // the optimization sub-graph will be run on every GPU; + // 3. Always broadcast received parameters in Distributed Training. + if ((use_gpu && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || + is_dist_train) { + for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(&result, bcast_name, dev_id); + } + } + } + /* + Dependency graph has been constructed. However, there are still data + hazards that need to be handled. + */ + PolishGraphToSupportDataHazards(&result); + + /* + * Only variables should be the leaves of the graph. 
+ */ + AddOutputToLeafOps(&result); + PADDLE_ENFORCE(!ir::HasCircle(result)); + return graph; +} + +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + +void MultiDevSSAGraphBuilder::SetCommunicationContext( + OpHandleBase *op_handle, const platform::Place &p) const { +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_ == nullptr) { + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + } +#else + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif +} + +void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { +#ifdef PADDLE_WITH_CUDA + auto *op_handle = new BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_); +#else + auto *op_handle = new BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_); +#endif + result->Get(kGraphOps).emplace_back(op_handle); + + auto *in = + result->Get(kGraphVars).at(src_dev_id).at(p_name).back().get(); + op_handle->AddInput(in); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->Get(kGraphVars).at(i).at(p_name); + auto *out_var = new VarHandle( + result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), + i, p_name, p); + vars.emplace_back(out_var); + op_handle->AddOutput(out_var); + } +} + +void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + int dev_id) const { + result->Get(kGraphOps).emplace_back( + new ComputationOpHandle(result->CreateOpNode(node->Op()), + local_scopes_[dev_id], places_[dev_id])); + CreateOpHandleIOs(result, node, dev_id); +} + +void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, + const std::string &og) const { +#ifdef PADDLE_WITH_CUDA + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); +#else + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + local_scopes_, places_)); +#endif + auto *op_handle = result->Get(kGraphOps).back().get(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->Get(kGraphVars)[i][og]; + PADDLE_ENFORCE(!vars.empty()); + auto &prev_grad = vars.back(); + op_handle->AddInput(prev_grad.get()); + + auto var = + new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), + vars.size(), i, og, p); + vars.emplace_back(var); + op_handle->AddOutput(var); + } +} + +void MultiDevSSAGraphBuilder::InsertDataBalanceOp( + ir::Graph *result, const std::vector &datas) const { +#ifdef PADDLE_WITH_CUDA + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( + result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); +#else + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( + result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), + local_scopes_, places_)); +#endif + auto *op_handle = result->Get(kGraphOps).back().get(); + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = 
places_[i]; + SetCommunicationContext(op_handle, p); + for (const std::string &d_name : datas) { + auto &vars = result->Get(kGraphVars)[i][d_name]; + PADDLE_ENFORCE(!vars.empty()); + op_handle->AddInput(vars.back().get()); + auto var = new VarHandle( + result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), + vars.size(), i, d_name, p); + vars.emplace_back(var); + op_handle->AddOutput(var); + } + } +} + +int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, + ir::Node *node) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + int op_role = boost::get( + node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + if (op_role != static_cast(framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(graph, param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, + const std::string &varname) const { + auto &sharded_var_device = graph.Get(kShardedVarDevice); + auto got = sharded_var_device.find(varname); + return got == sharded_var_device.end() ? -1 : got->second; +} + +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( + ir::Graph *result, const std::string &loss_grad_name) const { + for (size_t i = 0; i < places_.size(); ++i) { + // Insert ScaleCost OpHandle + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); + auto *op_handle = new ScaleLossGradOpHandle( + result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), + local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); + result->Get(kGraphOps).emplace_back(op_handle); + + // FIXME: Currently ScaleLossGradOp only use device_count as scale + // factor. So it does not depend on any other operators. 
+ // VarHandle *loss = GetVarHandle(loss_var_name, place); + // loss->pending_ops_.emplace_back(op_handle); + // op_handle->inputs_.emplace_back(loss); + + CreateOpOutput( + result, op_handle, + result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable), + places_[i], i); + } +} + +void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, + ir::Node *node, + size_t num_places) const { + for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { + auto p = places_[scope_idx]; + auto s = local_scopes_[scope_idx]; + result->Get(kGraphOps).emplace_back( + new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + CreateOpHandleIOs(result, node, scope_idx); + } +} + +VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, + const std::string &og, + int dst_dev_id) const { +#ifdef PADDLE_WITH_CUDA + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); +#else + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + local_scopes_, places_)); +#endif + auto *op_handle = result->Get(kGraphOps).back().get(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->Get(kGraphVars)[i][og]; + PADDLE_ENFORCE(!vars.empty()); + auto &prev_grad = vars.back(); + op_handle->AddInput(prev_grad.get()); + } + auto &vars = result->Get(kGraphVars)[dst_dev_id][og]; + auto var = + new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), + vars.size(), dst_dev_id, og, places_[dst_dev_id]); + vars.emplace_back(var); + op_handle->AddOutput(var); + return var; +} + +int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows") { + // TODO(paddle-dev): getting the first var is not safe. 
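+ // Place the split op on the device that already holds its first input variable.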
+ op_dev_id = GetVarDeviceID(*result, input_var_names[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); + for (auto &varname : output_var_names) { + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); + } + } else { + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; +} + +void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { + auto *op_handle = result->Get(kGraphOps).back().get(); + for (ir::Node *input : node->inputs) { + VarHandle *var = nullptr; + for (int place_offset = 0; place_offset < num_places; ++place_offset) { + auto &var_holders = result->Get(kGraphVars)[place_offset]; + auto &var_holder = var_holders[input->Name()]; + if (!var_holder.empty()) { + var = var_holder.rbegin()->get(); + op_handle->AddInput(var); + } + } + } +} + +// Create RPC related op handles that connects its in ops and out ops. +int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + if (node->Op()->Type() == "send") { + // TODO(paddle-dev): getting the first var is not safe. + op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name()); + PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), + "This hack no longer holds, please fix."); + // the variable name which contains .block means it was splited by + // split_byref op + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && + node->inputs[0]->Name().find(".block") == std::string::npos) { + std::vector input_var_names; + for (ir::Node *n : node->inputs) { + input_var_names.push_back(n->Name()); + } + auto send_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); + op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); + VLOG(10) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; + for (auto &varname : input_var_names) { + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); + } + result->Get(kShardedVarDevice) + .emplace(send_param_grad[1], op_dev_id); + } + } else if (node->Op()->Type() == "recv") { + std::vector output_var_names; + for (ir::Node *n : node->outputs) { + output_var_names.push_back(n->Name()); + } + auto recv_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + if (recv_param_grad.size() == 2U) { + op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); + VLOG(10) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; + } else { + op_dev_id = GetAppropriateDeviceID(output_var_names); + } + for (auto &varname : output_var_names) { + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); + } + } else { + // send_barrier, fetch_barrier will run on place 0; + op_dev_id = 0; + } + + PADDLE_ENFORCE(op_dev_id != -1, "can not find the 
right place for rpc op: %s", + node->Op()->Type()); + result->Get(kGraphOps).emplace_back(new RPCOpHandle( + result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], + node->Op()->Type(), places_[op_dev_id])); + + if (node->Op()->Type() == "send") { + CreateOpHandleIOs(result, node, op_dev_id); + } else { + // send_barrier, recv, fetch_barrier's inputs are deps var, get them from + // all places + auto p = places_[op_dev_id]; + auto *op_handle = result->Get(kGraphOps).back().get(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + SetOpInputsAllPlaces(result, node, places_.size()); + for (ir::Node *output : node->outputs) { + int outvar_dev_id = op_dev_id; + if (node->Op()->Type() == "fetch_barrier") { + outvar_dev_id = GetVarDeviceID(*result, output->Name()); + PADDLE_ENFORCE_NE(outvar_dev_id, -1); + } + p = places_[outvar_dev_id]; + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); + } + } + return op_dev_id; +} + +bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { + return boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode +} +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(multi_devices_pass, + paddle::framework::details::MultiDevSSAGraphBuilder) + .RequirePassAttr(paddle::framework::details::kLossVarName) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kParams) + .RequirePassAttr(paddle::framework::details::kLocalScopes) + .RequirePassAttr(paddle::framework::details::kStrategy); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1ca8c4b855f9468589e537245380451a91a50b14 --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
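+// Declares MultiDevSSAGraphBuilder, the pass that expands a single-device program graph into a multi-device SSA graph of computation, reduce/all_reduce, broadcast and RPC op handles.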
+ +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace platform { +class NCCLContextMap; +} + +namespace framework { +class Scope; +namespace details { + +class MultiDevSSAGraphBuilder : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + private: + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; + void Init() const; + + private: + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; + +#ifdef PADDLE_WITH_CUDA + mutable platform::NCCLContextMap *nccl_ctxs_; +#endif + + int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const; + + bool IsScaleLossOp(ir::Node *node) const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + /** + * Is this operator as the end-point operator before/after send operator. + */ + bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, + const std::vector &recv_vars) const; + + std::vector FindDistTrainSendVars( + const std::vector &nodes) const; + + std::vector FindDistTrainRecvVars( + const std::vector &nodes) const; + + void CreateComputationalOps(ir::Graph *result, ir::Node *node, + size_t num_places) const; + + void CreateScaleLossGradOp(ir::Graph *result, + const std::string &loss_grad_name) const; + + VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, + int dst_dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, + int dev_id) const; + + int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; + + void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; + + void InsertDataBalanceOp(ir::Graph *result, + const std::vector &datas) const; + + void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, + size_t src_dev_id) const; + + bool IsSparseGradient(const std::string &og) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + private: + mutable BuildStrategy strategy_; + mutable std::unordered_map all_vars_; + mutable std::vector balance_vars_; + + void SetCommunicationContext(OpHandleBase *op_handle, + const platform::Place &p) const; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc similarity index 76% rename from paddle/fluid/framework/details/ssa_graph_printer.cc rename to paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 22a40ca4b25cdd8ed9856b6c71bffc79561edcac..361c91dc78c08a2cbf84ee88211d389c1e2312e5 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/ssa_graph_printer.h" +#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include -#include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { namespace details { template -static inline void IterAllVar(const SSAGraph &graph, Callback callback) { - for (auto &each : graph.vars_) { +static inline void IterAllVar(const ir::Graph &graph, Callback callback) { + for (auto &each : graph.Get(kGraphVars)) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { callback(*pair2); @@ -30,12 +30,12 @@ static inline void IterAllVar(const SSAGraph &graph, Callback callback) { } } - for (auto &var : graph.dep_vars_) { + for (auto &var : graph.Get(kGraphDepVars)) { callback(*var); } } -void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, +void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, std::ostream &sout) const { size_t var_id = 0; std::unordered_map vars; @@ -54,14 +54,15 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ << "\\n" << var_handle_ptr->place_ << "\\n" - << var_handle_ptr->version_ << "\"]" << std::endl; + << "scope: " << var_handle_ptr->scope_idx_ << "\\n" + << "v" << var_handle_ptr->version_ << "\"]" << std::endl; } else if (dummy_ptr) { sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; } }); size_t op_id = 0; - for (auto &op : graph.ops_) { + for (auto &op : graph.Get(kGraphOps)) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; @@ -81,3 +82,6 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_devices_print_pass, + paddle::framework::details::SSAGraghBuilderWithPrinter); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c00685fa1629c0722c315c726053c2cba8bf17e7 --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class SSAGraphPrinter { + public: + virtual ~SSAGraphPrinter() {} + virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0; +}; + +class GraphvizSSAGraphPrinter : public SSAGraphPrinter { + public: + void Print(const ir::Graph& graph, std::ostream& sout) const override; +}; + +class SSAGraghBuilderWithPrinter : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + std::unique_ptr fout( + new std::ofstream(Get("debug_graphviz_path"))); + PADDLE_ENFORCE(fout->good()); + Get("graph_printer").Print(*graph, *fout); + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dot.cc b/paddle/fluid/framework/details/multi_devices_helper.cc similarity index 79% rename from paddle/fluid/inference/analysis/dot.cc rename to paddle/fluid/framework/details/multi_devices_helper.cc index d5471ffcb594a6915e9e65c0fee5adc5f5bdf40c..0242274a16c50508f2c0294264c175515c7293ef 100644 --- a/paddle/fluid/inference/analysis/dot.cc +++ b/paddle/fluid/framework/details/multi_devices_helper.cc @@ -11,13 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" namespace paddle { -namespace inference { -namespace analysis { -size_t Dot::counter = 0; -} // namespace analysis -} // namespace inference +namespace framework { +namespace details {} // namespace details +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..175c5a9950be69d7bf6ae9e386af762007a18a51 --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +// all variables on each device. +// The outside vector is the device vector. Each element of this vector is a +// map from variable name to variables. The variables, which have the same name, +// will have a different version. 
The offset in the + // `std::vector>` is the version of the variables. +typedef std::vector< + std::unordered_map>>> + GraphVars; +const char kGraphVars[] = "vars"; + +// aux variables to represent dependencies. Useful to resolve data hazards. +typedef std::unordered_set> GraphDepVars; +const char kGraphDepVars[] = "dep_vars"; + +// all operators. NOTE that even though we use a vector here, the operators are +// unordered. +typedef std::vector> GraphOps; +const char kGraphOps[] = "ops"; + +typedef std::unordered_map ShardedVarDevice; +const char kShardedVarDevice[] = "sharded_var_device"; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index d80bdcf15d798925c137460125964d3d7e65f67e..3812f0abf1b7069525c4420054c61c01c908acfe 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -80,19 +80,21 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { void OpHandleBase::AddInput(VarHandleBase *in) { this->inputs_.emplace_back(in); - in->pending_ops_.insert(this); + node_->inputs.push_back(in->Node()); + in->AddOutput(this, this->Node()); } void OpHandleBase::AddOutput(VarHandleBase *out) { outputs_.emplace_back(out); - out->generated_op_ = this; + node_->outputs.push_back(out->Node()); + out->AddInput(this, this->Node()); } void OpHandleBase::WaitInputVarGenerated() { for (auto in_var : inputs_) { if (NeedWait(in_var)) { for (auto &pair : dev_ctxes_) { - in_var->generated_op_->RecordWaitEventOnCtx(pair.second); + in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second); } } } @@ -101,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { for (auto *in : inputs_) { if (NeedWait(in)) { - in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]); + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]); } } } @@ -117,7 +119,7 @@ size_t OpHandleBase::NoDummyInputSize() const { } bool OpHandleBase::NeedWait(VarHandleBase *in_var) { - return in_var && in_var->generated_op_; + return in_var && in_var->GeneratedOp(); } void OpHandleBase::RunAndRecordEvent(const std::function &callback) { @@ -156,6 +158,16 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, #endif } +size_t OpHandleBase::NotReadyInputSize() const { + std::unordered_set res; + for (auto *var : inputs_) { + if (var->GeneratedOp() != nullptr) { + res.emplace(var); + } + } + return res.size(); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 3de22a0235ffaae220a00c7253271b395970082a..d4e2c44482fa8f4f5677b29b84fe50f2fe7e392e 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" @@ -26,9 +27,11 @@ namespace details { constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +// Wraps ir::Node and provides helper utilities. +// It's responsible for populating necessary fields of ir::Node. 
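+// For example, AddInput()/AddOutput() also record the corresponding var nodes in node_->inputs and node_->outputs.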
class OpHandleBase { public: - OpHandleBase() {} + explicit OpHandleBase(ir::Node *node) : node_(node) {} virtual ~OpHandleBase(); @@ -78,6 +81,8 @@ class OpHandleBase { return res.size(); } + size_t NotReadyInputSize() const; + const std::vector &Outputs() const { return outputs_; } size_t NoDummyInputSize() const; @@ -97,6 +102,7 @@ class OpHandleBase { virtual void RunImpl() = 0; + ir::Node *node_; std::vector inputs_; std::vector outputs_; std::map dev_ctxes_; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index c0cd873a1d83fa8c2c7b7cd5acfaad9949bcff7d..bd6153c0c736f6e32378eebcbf6c4d7e402c9b42 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -31,18 +31,20 @@ struct ReduceLoDTensor { : src_tensors_(src), dst_tensor_(*dst) {} template - void operator()() const { + void apply() const { PADDLE_ENFORCE(!src_tensors_.empty()); auto &t0 = *src_tensors_[0]; PADDLE_ENFORCE_NE(t0.numel(), 0); + dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - if (dst != t0.data()) { - std::copy(t0.data(), t0.data() + t0.numel(), dst); - } - for (size_t i = 1; i < src_tensors_.size(); ++i) { + for (size_t i = 0; i < src_tensors_.size(); ++i) { auto &t = *src_tensors_[i]; + if (dst == t.data()) { + continue; + } + PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); PADDLE_ENFORCE_EQ(t.type(), t0.type()); std::transform(t.data(), t.data() + t.numel(), dst, dst, diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 7160e346dad0615e2fd32b70c096880af0359e1a..7fc06f234d42a992328c0b6164f17945d8075c28 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -16,12 +16,19 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool( + cpu_deterministic, false, + "Whether to make the result of computation deterministic in CPU side."); namespace paddle { namespace framework { namespace details { void ReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); @@ -89,11 +96,33 @@ void ReduceOpHandle::RunImpl() { } else { std::vector lod_tensors = GetInputValues(in_var_handles, var_scopes); + if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { this->RunAndRecordEvent([&] { - ReduceLoDTensor func(lod_tensors, - out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + // FIXME(zcd): The order of summing is important, + // especially when the type of data is float or double. + // For example, the result of `a+b+c+d` may be different + // with the result of `c+a+b+d`, so the summing order should be fixed. + if (!FLAGS_cpu_deterministic) { + ReduceLoDTensor func(lod_tensors, + out_var->GetMutable()); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + } else { + // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 + // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. 
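+ // Accumulating into this single target in a fixed order keeps the floating-point result reproducible across runs.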
+ auto &reduce_sum_trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handle->name_) + ->GetMutable(); + ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + + auto trg = out_var->GetMutable(); + if (reduce_sum_trg.data() != trg->data()) { + TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); + } + } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 4d14334cdfe06e2e805c2577458d6689e6324cc7..a6289b055f97b7b0e57928358d84117b33cf2df8 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -37,10 +37,13 @@ struct ReduceOpHandle : public OpHandleBase { #ifdef PADDLE_WITH_CUDA const platform::NCCLContextMap *nccl_ctxs_; - ReduceOpHandle(const std::vector &local_scopes, + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); @@ -48,9 +51,9 @@ struct ReduceOpHandle : public OpHandleBase { } } #else - ReduceOpHandle(const std::vector &local_scopes, + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif std::string Name() const override; diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index ffdd7c14eb5097cc8285da090e4a72e1e3f43d86..3a9a58412391b188c5e804b41fa47b3607a36bd1 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -84,6 +84,7 @@ struct TestReduceOpHandle { } void InitReduceOp(size_t out_scope_idx) { + std::vector> nodes; // init scope for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); @@ -96,19 +97,21 @@ struct TestReduceOpHandle { } param_scopes_[out_scope_idx]->Var("out"); + nodes.emplace_back(new ir::Node("node")); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, + gpu_list_, nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, + gpu_list_, nccl_ctxs_.get())); #else - op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_)); + op_handle_.reset( + new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); #endif } @@ -118,8 +121,10 @@ struct TestReduceOpHandle { if (!use_gpu_) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); } - auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]); - in_var_handle->generated_op_ = nullptr; + nodes.emplace_back(new ir::Node("node1")); + auto *in_var_handle = + new 
VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); + in_var_handle->ClearGeneratedOp(); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); } @@ -128,12 +133,13 @@ struct TestReduceOpHandle { vars_.emplace_back(new DummyVarHandle()); DummyVarHandle *in_dummy_var_handle = static_cast(vars_.back().get()); - in_dummy_var_handle->generated_op_ = nullptr; + in_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddInput(in_dummy_var_handle); // add output - auto *out_var_handle = - new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]); + nodes.emplace_back(new ir::Node("node2")); + auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx, + "out", gpu_list_[out_scope_idx]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 586465f99fd94117c821be2952bffda385fbcf75..f44b374edb29228dff5a8bf003d945291f166d49 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -13,15 +13,17 @@ // limitations under the License. #include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { namespace details { -RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, +RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, const Scope *local_scope, const std::string &name, const platform::Place &place) - : op_(framework::OpRegistry::CreateOp(op_desc)), + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(op_desc)), local_scope_(local_scope), name_(name), place_(place) {} @@ -32,11 +34,11 @@ void RPCOpHandle::RunImpl() { for (auto *in : inputs_) { auto &p = static_cast(in)->place_; // FIXME(Yancey1989): need a better solution instead of use DebugString() - if (in->DebugString() == "dummy") { // HACK + if (ir::IsControlDepVar(*in->Node())) { // HACK continue; } - if (in->generated_op_) { - in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]); + if (in->GeneratedOp()) { + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]); } } auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index ae38c7fe19e102a330455d89a1068414a7835fab..7f99cdeacf618a9496eaef98520685d6d1621ae1 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -28,8 +28,9 @@ namespace framework { namespace details { struct RPCOpHandle : public OpHandleBase { - RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const std::string& name, const platform::Place& place); + RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc, + const Scope* local_scope, const std::string& name, + const platform::Place& place); std::string Name() const override; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index d9c387e79dc71288e7330597fed57171d447f31b..ba243979b34aa1f683de707525403becaf0a1c00 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -19,10 +19,14 @@ namespace paddle { namespace framework { namespace details { -ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope, 
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, + Scope *scope, platform::Place place, platform::DeviceContext *dev_ctx) - : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { + : OpHandleBase(node), + coeff_(static_cast(1.0 / num_dev)), + scope_(scope), + place_(place) { dev_ctxes_[place_] = dev_ctx; } @@ -47,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(1) << place_ << "RUN Scale loss grad op"; + VLOG(10) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index d93d599d46f130cf98f39f15697ce994a31e20c3..523b55724c82d4e2bef0520c10e5708c952a3ecc 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -25,7 +25,8 @@ namespace framework { namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { - ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place, + ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, + platform::Place place, platform::DeviceContext *context); ~ScaleLossGradOpHandle() final; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 51e840ffa6c8e9818058cdbb87d631f0004e9d93..e5b1eaa7318aecde1dbf89de8fe242a3008db97c 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include #include #include #include "paddle/fluid/framework/executor.h" @@ -57,8 +58,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } } } + std::vector fetch_data; + std::exception_ptr eptr; + try { + fetch_data = underlying_executor_->Run(fetch_tensors); + } catch (...) 
{ + eptr = std::current_exception(); + } - auto fetch_data = underlying_executor_->Run(fetch_tensors); + platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; #ifdef PADDLE_WITH_CUDA @@ -89,7 +97,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( scope->DeleteScope(local_scope); } } - return fetch_data; + if (eptr) { + std::rethrow_exception(eptr); + } else { + return fetch_data; + } } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 20df7a4722d589ffd168f842e927cff8411096bb..5e87e0bf50b51d2b630aba06a5907dd721754d1f 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -17,6 +17,9 @@ #include #include #include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" + #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" @@ -37,6 +40,11 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); + + const ir::Graph& Graph() const override { + return underlying_executor_->Graph(); + } + FeedFetchList Run(const std::vector& fetch_tensors) override; private: diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h deleted file mode 100644 index e996a00c162186e47e77d007503ac67caa9f8024..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/var_handle.h" - -namespace paddle { -namespace framework { -namespace details { - -// A SSA graph used by parallel executor. -struct SSAGraph { - // all variable in each devices. - // The outside vector is the device vector. Each element of this vector is a - // map from variable name to variables. The variables, who have the same name, - // will have a different version. The offset in the - // `std::vector>` is the version of varaibles. - std::vector< - std::unordered_map>>> - vars_; - - // aux variables to represent dependency. Useful to resolve data hazard. - std::unordered_set> dep_vars_; - - // all operators. NOTE that even we use a vector here, the operators is - // unordered. 
- std::vector> ops_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc deleted file mode 100644 index 88a21f48879a15450051ad94ed76e1c48bf23014..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include - -namespace paddle { -namespace framework { -namespace details { -void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { - for (auto &var_map : graph->vars_) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - continue; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - auto *write_op = (*it_new)->generated_op_; - auto &read_ops = (*it_old)->pending_ops_; - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. 
- continue; - } - - auto *dep_var = new DummyVarHandle(); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - graph->dep_vars_.emplace(dep_var); - } - } - } - } -} - -VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( - SSAGraph *graph, const std::string &each_var_name, - const platform::Place &place, size_t place_offset) { - auto &var_holders = graph->vars_[place_offset]; - auto &var_holder = var_holders[each_var_name]; - VarHandle *var = nullptr; - if (var_holder.empty()) { - var = new VarHandle(0, place_offset, each_var_name, place); - var_holder.emplace_back(var); - } else { - var = var_holder.rbegin()->get(); - } - return var; -} - -void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, - const std::string &each_var_name, - const platform::Place &place, - size_t place_offset) { - auto &vars = graph->vars_[place_offset][each_var_name]; - size_t version = vars.size(); - auto var = new VarHandle(version, place_offset, each_var_name, place); - vars.emplace_back(var); - op_handle->AddOutput(var); -} - -void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { - for (auto &op : graph->ops_) { - if (!op->Outputs().empty()) { - continue; - } - auto *dummy_leaf = new DummyVarHandle(); - graph->dep_vars_.emplace(dummy_leaf); - op->AddOutput(dummy_leaf); - } -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h deleted file mode 100644 index 18612c3c1b62cf4c2ebdc221c301c59ec81c2da7..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/details/ssa_graph.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -namespace details { - -class SSAGraphBuilder { - public: - SSAGraphBuilder() {} - virtual ~SSAGraphBuilder() {} - virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - virtual int GetVarDeviceID(const std::string &var_name) const = 0; - - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); - - protected: - /** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. 
- * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ - static void PolishGraphToSupportDataHazards(SSAGraph *graph); - - static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, - const std::string &each_var_name, - const platform::Place &place, - size_t place_offset); - - // Add an output variable (each_var_name, place, place_offset) to op_handle, - // which belongs to graph - static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, - const std::string &each_var_name, - const platform::Place &place, size_t place_offset); - - static void AddOutputToLeafOps(SSAGraph *graph); -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc deleted file mode 100644 index b4b49d3de6da2e5fd7836668619e42d10bb6b35a..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" -#include -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/ssa_graph_checker.h" -#include "paddle/fluid/framework/details/ssa_graph_printer.h" - -namespace paddle { -namespace framework { -namespace details { -std::unique_ptr SSAGraphBuilderFactory::Create() { - std::unique_ptr res( -#ifdef PADDLE_WITH_CUDA - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, nccl_ctxs_, strategy_) -#else - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, strategy_) -#endif - ); // NOLINT - - if (!strategy_.debug_graphviz_path_.empty()) { - std::unique_ptr fout( - new std::ofstream(strategy_.debug_graphviz_path_)); - PADDLE_ENFORCE(fout->good()); - std::unique_ptr graphviz_printer( - new GraphvizSSAGraphPrinter()); - res.reset(new SSAGraghBuilderWithPrinter( - std::move(fout), std::move(graphviz_printer), std::move(res))); - } - res.reset(new SSAGraghBuilderWithChecker(std::move(res))); - - return res; -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h deleted file mode 100644 index 91a119de83ed3d1573803e48faf86c874eed98d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include "paddle/fluid/platform/place.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class SSAGraphBuilderFactory { - public: - SSAGraphBuilderFactory(const std::vector& places, - const std::string& loss_var_name, - const std::unordered_set& param_names, - const std::vector& local_scopes, - const BuildStrategy& strategy) - : places_(places), - loss_var_name_(loss_var_name), - param_names_(param_names), - local_scopes_(local_scopes), - strategy_(strategy) { -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_ = nullptr; -#endif - } - -#ifdef PADDLE_WITH_CUDA - void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { - nccl_ctxs_ = nccl_ctxs; - } -#endif - - std::unique_ptr Create(); - - private: - std::vector places_; - std::string loss_var_name_; - std::unordered_set param_names_; - std::vector local_scopes_; - BuildStrategy strategy_; - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap* nccl_ctxs_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 958086033607a4ed8fb840f5b14fe5779625bd82..96fffb7d9430cd00b3823ada9fbe9a65a6bd718c 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -18,8 +18,8 @@ #include #include -#include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -32,7 +32,9 @@ class SSAGraphExecutor { virtual ~SSAGraphExecutor(); - virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; + virtual const ir::Graph& Graph() const = 0; + + virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h deleted file mode 100644 index 09b0333ef2cb43a306133aa5af98d37c11454d4d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "paddle/fluid/framework/details/ssa_graph_builder.h" - -namespace paddle { -namespace framework { -namespace details { -struct SSAGraph; -class SSAGraphPrinter { - public: - virtual ~SSAGraphPrinter() {} - virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0; -}; - -class GraphvizSSAGraphPrinter : public SSAGraphPrinter { - public: - void Print(const SSAGraph& graph, std::ostream& sout) const override; -}; - -class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { - public: - SSAGraghBuilderWithPrinter(std::ostream& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ref_(sout) {} - - SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ptr_(std::move(sout)), - stream_ref_(*stream_ptr_) {} - - std::unique_ptr Build(const ProgramDesc& program) const override { - auto graph = builder_->Build(program); - printer_->Print(*graph, stream_ref_); - return graph; - } - - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); - } - - private: - std::unique_ptr printer_; - std::unique_ptr builder_; - std::unique_ptr stream_ptr_; - std::ostream& stream_ref_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 99b10254a7961bf7b27b256acaece573a71c4115..c9e331ef359f853263f8dad38dd0a2be4d9618ad 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -14,13 +14,16 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/platform/profiler.h" + namespace paddle { namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph) + std::unique_ptr &&graph) : graph_(std::move(graph)), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), @@ -32,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { + std::unique_ptr event( + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); std::unordered_map pending_ops; std::unordered_set pending_vars; BlockingQueue ready_vars; @@ -43,18 +48,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set delayed_ops; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->vars_) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); } } } - for (auto &var : graph_->dep_vars_) { + for (auto &var : graph_->Get(details::kGraphDepVars)) { InsertPendingVar(&pending_vars, &ready_vars, var.get()); } - for (auto &op : graph_->ops_) { + for (auto &op : graph_->Get(details::kGraphOps)) { if (op->Inputs().empty()) { // Special case, Op has no input. 
ready_ops.insert(op.get()); } else { @@ -64,11 +69,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Step 2. Insert FetchOps std::vector> fetch_ops; + std::vector> tmp_nodes; std::unordered_set> fetch_dependencies; FeedFetchList fetch_data(fetch_tensors.size()); - InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, - &pending_vars, &ready_vars, &fetch_data); + InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies, + &pending_ops, &pending_vars, &ready_vars, &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { @@ -78,6 +84,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( set.clear(); }; + // Clean run context + run_op_futures_.clear(); + exception_holder_.Clear(); + event.reset(nullptr); + // Step 3. Execution while (!pending_vars.empty()) { // 1. Run All Ready ops @@ -96,20 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::lock_guard l(exception_mu_); - if (exception_) { - std::exception *exp = exception_.get(); - if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - exception_.reset(); - throw e; - } else if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - exception_.reset(); - throw e; - } else { - LOG(FATAL) << "Unknown exception."; + if (exception_holder_.IsCaught()) { + for (auto &run_op_future : run_op_futures_) { + run_op_future.wait(); } + exception_holder_.ReThrow(); } else { continue; } @@ -118,7 +120,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Find the ready_ops after the ready_var. for (auto ready_var : cur_ready_vars) { pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { + for (auto *op : ready_var->PendingOps()) { auto &deps = pending_ops[op]; --deps; if (deps == 0) { @@ -144,6 +146,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector> *fetch_ops, + std::vector> *temp_nodes, std::unordered_set> *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, @@ -151,7 +154,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->vars_) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); @@ -161,8 +164,16 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( for (size_t i = 0; i < fetch_tensors.size(); ++i) { auto &var_name = fetch_tensors[i]; - auto &vars = fetched_vars.at(var_name); - auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_); + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), + "Cannot find fetched variable.(Perhaps the main_program " + "is not set to ParallelExecutor)"); + + auto &vars = fetched_var_it->second; + + temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i, + &local_scopes_); fetch_ops->emplace_back(op); for (auto &p : places_) { @@ -173,7 +184,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( op->AddInput(var); } - auto *fetch_dummy = new DummyVarHandle(); + temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get()); op->AddOutput(fetch_dummy); 
fetch_dependencies->emplace(fetch_dummy); this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy); @@ -191,7 +203,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( std::unordered_set *pending_vars, BlockingQueue *ready_vars, VarHandleBase *var) const { pending_vars->insert(var); - if (var->generated_op_ == nullptr) { + if (var->GeneratedOp() == nullptr) { ready_vars->Push(var); } } @@ -208,21 +220,12 @@ void ThreadedSSAGraphExecutor::RunOp( running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; - } catch (platform::EOFException ex) { - std::lock_guard l(exception_mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(ex)); - } - } catch (platform::EnforceNotMet ex) { - std::lock_guard l(exception_mu_); - exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { - LOG(FATAL) << "Unknown exception catched"; + exception_holder_.Catch(std::current_exception()); } }; if (pool_) { - pool_->enqueue(op_run); + run_op_futures_.emplace_back(pool_->enqueue(op_run)); } else { op_run(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index c69e0487e2e503a0d445300aa2fd6bb9c30b06c9..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -23,9 +24,11 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -38,8 +41,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + std::unique_ptr &&graph); + const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -51,13 +55,12 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: - std::unique_ptr graph_; + std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::mutex exception_mu_; - std::unique_ptr exception_; + ExceptionHolder exception_holder_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, @@ -70,6 +73,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { void InsertFetchOps( const std::vector &fetch_tensors, std::vector> *fetch_ops, + std::vector> *temp_nodes, std::unordered_set> *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, @@ -77,6 +81,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { private: ExecutionStrategy strategy_; + // use std::list because clear(), push_back, and for_each are O(1) + std::list> run_op_futures_; }; } // namespace details diff --git 
a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 6f00abd9473a84a77ed1a39015e2ae079e00be79..5457870e9ff5d7cf67c9c7076b9aae94eeada779 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const { return ss.str(); } -std::string DummyVarHandle::DebugString() const { return "dummy"; } +std::string DummyVarHandle::DebugString() const { return node_->Name(); } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index cae9af7217660fb7e4b8535ee8e022fb3a127668..d8c2bc40b9458a1d5a7dd8a32277d04f69295f09 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -13,11 +13,14 @@ // limitations under the License. #pragma once + +#include #include #include #include #include +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -25,19 +28,60 @@ namespace framework { namespace details { class OpHandleBase; +// Wraps ir::Node and provide helper utilities. +// It's responsible for populating necessary fields of ir::Node. +// // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. // This is a single assignment graph. struct VarHandleBase { + explicit VarHandleBase(ir::Node* node) : node_(node) {} + virtual ~VarHandleBase(); + virtual std::string DebugString() const = 0; + void AddInput(OpHandleBase* in, ir::Node* node) { + node_->inputs.clear(); + node_->inputs.push_back(node); + generated_op_ = in; + } + + void AddOutput(OpHandleBase* out, ir::Node* node) { + if (pending_ops_.find(out) == pending_ops_.end()) { + pending_ops_.insert(out); + node_->outputs.push_back(node); + } + } + + void RemoveOutput(OpHandleBase* out, ir::Node* node) { + pending_ops_.erase(out); + node_->outputs.erase( + std::remove(node_->outputs.begin(), node_->outputs.end(), node), + node_->outputs.end()); + } + + void ClearGeneratedOp() { + generated_op_ = nullptr; + node_->inputs.clear(); + } + + OpHandleBase* GeneratedOp() { return generated_op_; } + + const std::unordered_set& PendingOps() const { + return pending_ops_; + } + + ir::Node* Node() { return node_; } + + protected: // The operator who generate this variable. nullptr if the variable // is a root node. OpHandleBase* generated_op_{nullptr}; // Operators which depend on this variable ready. std::unordered_set pending_ops_; + ir::Node* node_; }; // VarHandle is actually a single version of Runtime Variable. @@ -46,11 +90,14 @@ struct VarHandleBase { // // NOTE: runtime variables have place. struct VarHandle : public VarHandleBase { + explicit VarHandle(ir::Node* node) : VarHandleBase(node) {} + std::string DebugString() const override; - VarHandle(size_t version, size_t scope_index, std::string name, - platform::Place place) - : version_(version), + VarHandle(ir::Node* node, size_t version, size_t scope_index, + std::string name, platform::Place place) + : VarHandleBase(node), + version_(version), scope_idx_(scope_index), name_(std::move(name)), place_(std::move(place)) {} @@ -70,6 +117,8 @@ struct VarHandle : public VarHandleBase { // Dummy Variable. 
It is used to represent dependencies between operators struct DummyVarHandle : public VarHandleBase { + explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {} + std::string DebugString() const override; }; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 6868f639a03bef9acdd9d6418883e7c502761ec5..fd58de28afe868ca8574e7d2550fef1a313f1e8b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -47,19 +47,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { Executor::Executor(const platform::Place& place) : place_(place) {} +void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE -void Executor::BeginPass() { ::paddle::operators::distributed::RPCClient::GetInstance< ::paddle::operators::distributed::GRPCClient>() - ->SendBeginPass(); -} - -void Executor::EndPass() { - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>() - ->SendEndPass(); -} + ->SendComplete(); #endif +} void InitializeVariable(Variable* var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { @@ -357,7 +351,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } for (auto& op : ctx->ops_) { - VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 81d83ecea50e360b6c1935777dd246f012160d5a..122bafedce3607d3fb6d1a6cb75bf038d00aa771 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -89,17 +89,11 @@ class Executor { explicit Executor(const platform::Place& place); -#ifdef PADDLE_WITH_DISTRIBUTE /* - * Sending signal to pserver to mark current pass started. + * Close this Executor. + * Calling this method will send complete messages to all pserver instances. */ - void BeginPass(); - - /* - * Sending signal to pserver to mark current pass finished. - */ - void EndPass(); -#endif + void Close(); /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope @@ -111,6 +105,7 @@ class Executor { void Run(const ProgramDesc& prog, Scope* scope, int block_id, bool create_local_scope = true, bool create_vars = true); + // This API is very slow. void Run(const ProgramDesc& program, Scope* scope, std::map* feed_targets, std::map* fetch_targets, @@ -130,6 +125,7 @@ class Executor { bool create_local_scope = true, bool create_vars = true, bool keep_kids = false); + // This API is very slow. void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map* feed_targets, std::map* fetch_targets, diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 2cf14bd371831ab682166f4256d6966b5ab278c8..460401df5473f8650f450a2bd247a703d91b6048 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -16,6 +16,13 @@ syntax = "proto2"; option optimize_for = LITE_RUNTIME; package paddle.framework.proto; +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + enum AttrType { INT = 0; FLOAT = 1; @@ -107,6 +114,7 @@ message VarType { // Tensor is used in C++. 
SIZE_T = 19; UINT8 = 20; + INT8 = 21; // Other types that may need additional descriptions LOD_TENSOR = 7; @@ -179,4 +187,8 @@ message BlockDesc { // for more details. // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? -message ProgramDesc { repeated BlockDesc blocks = 1; } +message ProgramDesc { + repeated BlockDesc blocks = 1; + + optional Version version = 2; +} diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7004f484a9975124750fad4cb8f773342082b514 --- /dev/null +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -0,0 +1,50 @@ +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") + + +# Usage: pass_library(target inference) will append to paddle_inference_pass.h +function(pass_library TARGET DEST) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS}) + # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. + if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") + message(STATUS "add pass ${TARGET} ${DEST}") + file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") + set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) + endif() +endfunction() + +cc_library(node SRCS node.cc DEPS proto_desc) +cc_library(graph SRCS graph.cc DEPS node pretty_log) +cc_library(graph_helper SRCS graph_helper.cc DEPS graph) +cc_library(pass SRCS pass.cc DEPS graph node graph_helper) +cc_library(graph_traits SRCS graph_traits.cc DEPS graph) +cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) + +pass_library(graph_to_program_pass base) +pass_library(graph_viz_pass base) +pass_library(fc_fuse_pass inference) +if(WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif() +pass_library(attention_lstm_fuse_pass inference) +pass_library(infer_clean_graph_pass inference) +pass_library(fc_lstm_fuse_pass inference) +pass_library(fc_gru_fuse_pass inference) +pass_library(seq_concat_fc_fuse_pass inference) + +set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") + +cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) +cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) +if(WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif() diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5 --- /dev/null +++ 
b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct Param { + std::string X = "concat_0.tmp_0"; + std::string C0 = "cell_init"; + std::string H0 = "hidden_init"; + std::string AttentionWeight = "attention_fc.w_0"; + std::string AttentionBias = "attention_fc.b_0"; + std::string AttentionScalar = "attention_output.w_0"; + std::string AttentionScalarBias = "attention_output.b_0"; + std::string LSTMWeight = "attention_w.new"; + std::string LSTMBias = "attention_b.new"; + std::string Hidden = "array_to_lod_tensor_0.tmp_0"; + std::string Cell = "at.cell.new"; + std::string AttentionedX = "at.x.new"; + std::string AttentionFCOut = "at.fc.new"; + std::string LSTMX = "at.lstmx.new"; + std::string LSTMOUT = "at.lstmout.new"; +}; + +void PrepareParameters(Graph* graph, const Param& param); + +void FindWhileOp(Graph* graph) { + GraphPatternDetector gpd; + std::unordered_set fused_external_ops( + {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48, + 57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 77, 39, 40, 51}); + + gpd.mutable_pattern()->NewNode( + [&](Node* n) { return fused_external_ops.count(n->id()); }, "while"); + + if (!graph->Has(kGraphvizMarkedNodeAttr)) { + graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t); + } + auto& marked_nodes = + graph->Get(kGraphvizMarkedNodeAttr); + + auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + auto* while_pat_node = gpd.pattern().RetrieveNode("while"); + auto* while_node = subgraph.at(while_pat_node); + marked_nodes.insert(while_node); + }; + gpd(graph, handle); + + Param param; + // Add AttentionLSTM node + OpDesc op_desc; + op_desc.SetType("attention_lstm"); + +#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x}); +#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x}); + OP_SET_IN(X); + OP_SET_IN(C0); + OP_SET_IN(H0); + OP_SET_IN(AttentionWeight); + OP_SET_IN(AttentionBias); + OP_SET_IN(AttentionScalar); + OP_SET_IN(AttentionScalarBias); + OP_SET_IN(LSTMWeight); + OP_SET_IN(LSTMBias); + + OP_SET_OUT(Hidden); + OP_SET_OUT(Cell); + OP_SET_OUT(AttentionedX); + OP_SET_OUT(AttentionFCOut); + OP_SET_OUT(LSTMX); + OP_SET_OUT(LSTMOUT); +#undef OP_SET_IN +#undef OP_SET_OUT + + auto* X = graph->RetriveNode(34); + auto* LSTMOUT = graph->RetriveNode(81); + auto* cell_init = graph->RetriveNode(6); + auto* hidden_init = graph->RetriveNode(8); + + auto* lstm_op = graph->CreateOpNode(&op_desc); + PrepareParameters(graph, param); + + IR_NODE_LINK_TO(X, lstm_op); + IR_NODE_LINK_TO(cell_init, lstm_op); + IR_NODE_LINK_TO(hidden_init, lstm_op); + IR_NODE_LINK_TO(lstm_op, 
LSTMOUT); + + GraphSafeRemoveNodes(graph, marked_nodes); +} + +#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x); +#define CHECK_P2(x0, x1) \ + CHECK_P1(x0); \ + CHECK_P1(x1); +#define CHECK_P3(x0, x1, x2) \ + CHECK_P2(x0, x1); \ + CHECK_P1(x2); +#define CHECK_P4(x0, x1, x2, x3) \ + CHECK_P3(x0, x1, x2); \ + CHECK_P1(x3); +#define CHECK_P5(x0, x1, x2, x3, x4) \ + CHECK_P4(x0, x1, x2, x3); \ + CHECK_P1(x4); + +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out); + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out); + +void PrepareParameters(Graph* graph, const Param& param) { + // Check parameters + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + + // Create new parameters. + scope->Var(param.LSTMWeight)->GetMutable(); + scope->Var(param.LSTMBias)->GetMutable(); + scope->Var(param.Hidden)->GetMutable(); + scope->Var(param.Cell)->GetMutable(); + scope->Var(param.AttentionedX)->GetMutable(); + scope->Var(param.AttentionFCOut)->GetMutable(); + scope->Var(param.LSTMX)->GetMutable(); + scope->Var(param.LSTMOUT)->GetMutable(); + +#define GATE_W(name__) \ + auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ + auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ + auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ + CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ + VLOG(4) << #name__ "_w0" \ + << " shape: " << W_##name__##_w0->Get().dims(); \ + VLOG(4) << #name__ "_w1" \ + << " shape: " << W_##name__##_w1->Get().dims(); \ + VLOG(4) << #name__ "_b0" \ + << " shape: " << W_##name__##_b0->Get().dims(); \ + auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ + auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ + auto& W_##name__##_b0_t = W_##name__##_b0->Get(); + + GATE_W(forget); + GATE_W(input); + GATE_W(output); + GATE_W(c); +#undef GATE_W + + auto* attention_fc_w = scope->FindVar("attention_fc.w_0"); + auto* attention_fc_b = scope->FindVar("attention_fc.b_0"); + auto* attention_output_w = scope->FindVar("attention_output.w_0"); + auto* attention_output_b = scope->FindVar("attention_output.b_0"); + CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w, + attention_output_b); + + auto* lstm_weight = scope->Var(param.LSTMWeight); + auto* lstm_weight_t = lstm_weight->GetMutable(); + auto* lstm_bias = scope->Var(param.LSTMBias); + auto* lstm_bias_t = lstm_bias->GetMutable(); + + // reshape attention_bias + auto* attention_bias_t = + scope->FindVar(param.AttentionBias)->GetMutable(); + PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1); + attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]})); + + auto* attention_scalar_bias_t = + scope->FindVar(param.AttentionScalarBias)->GetMutable(); + attention_scalar_bias_t->Resize( + make_ddim({1, attention_scalar_bias_t->dims()[0]})); + + PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t, + W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t, + lstm_weight_t); + PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t, + lstm_bias_t); +} + +// Prepare parameters +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + 
const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out) { + int D = W_forget_w0.dims()[0]; + int M = W_forget_w1.dims()[0]; + out->Resize(make_ddim({D + M, 4 * D})); + VLOG(3) << "LSTMWeight resized to " << out->dims(); + + float* out_data = out->mutable_data(platform::CPUPlace()); + std::array tensors( + {{W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}}); + std::array tensors1( + {{W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}}); + + for (int row = 0; row < D; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * row + D * col; + const float* src = tensors[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } + + for (int row = 0; row < M; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * (D + row) + D * col; + const float* src = tensors1[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } +} + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out) { + std::array tensors( + {{B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}}); + + PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); + int D = B_forget.dims()[0]; + out->Resize(make_ddim({1, 4 * D})); + auto* out_data = out->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < tensors.size(); i++) { + memcpy(out_data + D * i, tensors[i], D * sizeof(float)); + } +} + +// Parameters + +std::unique_ptr AttentionLSTMFusePass::ApplyImpl( + std::unique_ptr graph) const { + PDPattern external_pattern, subblock_pattern; + + FindWhileOp(graph.get()); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(attention_lstm_fuse_pass, + paddle::framework::ir::AttentionLSTMFusePass); diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..a756dfc1b98e1de55c809c73e2c4df1e628950ae --- /dev/null +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AttentionLSTMFusePass : public FusePassBase { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..09c5ec59d66445bdbd5349447b125be89cb2efdf --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr ConvReLUFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); + + std::unordered_set nodes2delete; + + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_relu_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(), + "conv_relu_mkldnn_fuse"); + conv_relu_pattern(conv_input); + + int found_conv_relu_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvReLU fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out + GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op + + // Create an ConvReLU Node. + OpDesc desc; + std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); + std::string conv_relu_w_in = conv_weight->Name(); + std::string conv_relu_b_in = conv_bias->Name(); + std::string conv_relu_out = relu_out->Name(); + desc.SetInput("Input", std::vector({conv_relu_i_in})); + desc.SetInput("Filter", std::vector({conv_relu_w_in})); + desc.SetInput("Bias", std::vector({conv_relu_b_in})); + desc.SetOutput("Output", std::vector({conv_relu_out})); + desc.SetType("conv2d"); + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + desc.SetAttr("fuse_relu", true); + auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied. 
+ GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out}); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); + IR_NODE_LINK_TO(conv_weight, conv_relu_node); + IR_NODE_LINK_TO(conv_bias, conv_relu_node); + IR_NODE_LINK_TO(conv_relu_node, relu_out); + + found_conv_relu_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_relu_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_relu_mkldnn_fuse_pass, + paddle::framework::ir::ConvReLUFusePass); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b5de0d548713772e7ad41cfb6d8b3e9460683efb --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the CONV and ReLU to a ConvReLUOp. + */ +class ConvReLUFusePass : public FusePassBase { + public: + virtual ~ConvReLUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..82b5fa1886098ca3b19c147c307d3f2fc3ba03d6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); +} + +// a->OP0->b +// b->OP1->c +// (c, weights, bias)->conv->f +// (f)->relu->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"b"}), + std::vector({"c"})); + SetOp(&prog, "conv2d", std::vector({"c", "weights", "bias"}), + std::vector({"f"})); + SetOp(&prog, "relu", std::vector({"f"}), + std::vector({"g"})); + + return prog; +} + +TEST(ConvReLUFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: CONV, RELU, conv_out + // Add 1 Node: ConvReLU + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_relu op in newly generated graph + int conv_relu_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + if (node->Op()->HasAttr("use_mkldnn")) { + bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); + if (use_mkldnn) { + if (node->Op()->HasAttr("fuse_relu")) { + bool fuse_relu = boost::get(node->Op()->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; + } + } + } + } + } + } + EXPECT_EQ(conv_relu_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_relu_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca704c7f5631bbaa88f1bc2caaa22fd021de11c4 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr FCFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("fc_fuse", graph.get()); + + std::unordered_set nodes2delete; + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("fc_fuse/x") + ->AsInput() + ->assert_is_op_input("mul", "X"); + patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse"); + fc_pattern(x, true /*with bias*/); + + int found_fc_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle FC fuse"; + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + + // Create an FC Node. + OpDesc desc; + std::string fc_x_in = subgraph.at(x)->Name(); + std::string fc_Y_in = w->Name(); + std::string fc_bias_in = fc_bias->Name(); + std::string fc_out_out = fc_out->Name(); + desc.SetInput("Input", std::vector({fc_x_in})); + desc.SetInput("W", std::vector({fc_Y_in})); + desc.SetInput("Bias", std::vector({fc_bias_in})); + desc.SetOutput("Out", std::vector({fc_out_out})); + desc.SetType("fc"); + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. + GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); + + PADDLE_ENFORCE(subgraph.count(x)); + IR_NODE_LINK_TO(subgraph.at(x), fc_node); + IR_NODE_LINK_TO(w, fc_node); + IR_NODE_LINK_TO(fc_bias, fc_node); + IR_NODE_LINK_TO(fc_node, fc_out); + + found_fc_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_fc_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..6c69539d1e48268afc2435f8f73b3818d13107cd --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the MUL and ELEMENTWISE_ADD to a FCOp. 
+ */ +class FCFusePass : public FusePassBase { + public: + virtual ~FCFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..06286a109d01af638e74e06ccc83e2a5500663ea --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "mul") { + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } else if (type == "elementwise_add") { + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); +} + +// a->OP0->b +// a->OP1->c +// (b, c)->mul->d +// (d, e)->elementwise_add->f +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + return prog; +} + +TEST(FCFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("fc_fuse_pass"); + + int pre_nodes = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int after_nodes = graph->Nodes().size(); + + // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out + // Add 1 Node: FC + EXPECT_EQ(pre_nodes - 2, after_nodes); + + // Assert fc op in newly generated graph + int fc_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "fc") { + ++fc_count; + } + } + EXPECT_EQ(fc_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a902b0b50cf27ff84877053aca2ff921cd00b833 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::FC fc_pattern(pattern, name_scope); + patterns::GRU gru_pattern(pattern, name_scope); + + PDNode* x = + pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable(); + + auto* fc_out = fc_pattern(x, with_fc_bias); + fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. + gru_pattern(fc_out); + + // Create New OpDesc + auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* fc_bias) { + + OpDesc op_desc; + op_desc.SetType("fusion_gru"); + +#define NEW_NAME(x) name_scope + "/at." #x ".new" +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(X, x); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + if (with_fc_bias) { + op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()}); + } else { + SET_IN(Bias, bias); + } +#undef SET_IN + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + +#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)}) + SET_IMTERMEDIATE_OUT(ReorderedH0); + SET_IMTERMEDIATE_OUT(XX); + SET_IMTERMEDIATE_OUT(BatchedInput); + SET_IMTERMEDIATE_OUT(BatchedOut); +#undef SET_IMTERMEDIATE_OUT + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + PADDLE_ENFORCE(scope); + if (with_fc_bias) { + // Fusion GRU bias = fcbias + grubias + auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias->Name()); + auto* out_bias_tensor = + fusion_bias_var->GetMutable(); + PADDLE_ENFORCE(fusion_bias_var); + auto* gru_bias_var = scope->FindVar(bias->Name()); + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + PADDLE_ENFORCE(gru_bias_var); + PADDLE_ENFORCE(fc_bias_var); + const auto& gru_bias_tenosr = gru_bias_var->Get(); + const auto& fc_bias_tensor = fc_bias_var->Get(); + // new bias = fc bias + gru bias + out_bias_tensor->Resize(gru_bias_tenosr.dims()); + auto* data = out_bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < out_bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; + } + } +#undef GET_NODE + +#define NEW_IMTERMEDIATE_OUT(key) \ + scope->Var(NEW_NAME(key))->GetMutable() + NEW_IMTERMEDIATE_OUT(ReorderedH0); + NEW_IMTERMEDIATE_OUT(XX); + NEW_IMTERMEDIATE_OUT(BatchedInput); + NEW_IMTERMEDIATE_OUT(BatchedOut); +#undef NEW_NAME +#undef NEW_IMTERMEDIATE_OUT + + IR_NODE_LINK_TO(x, op); 
+ IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); // actually should link to new bias if have + IR_NODE_LINK_TO(op, hidden); + // h0? + return op; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + auto* x_n = subgraph.at(x); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern); + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern); + + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + + gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate, + BatchResetHiddenPrev, BatchHidden}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden}); + GraphSafeRemoveNodes(graph, marked_nodes); + } +#undef GET_NODE + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr MulGRUFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + false /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +std::unique_ptr FCGRUFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + true /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass); +REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..63e1c72bfb2e2641ae5d44858b342d5e427e9045 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
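fusion_gru keeps a single Bias input, so when with_fc_bias is set the gru_creater lambda above folds the FC bias into the GRU bias element by element before wiring the new op. A standalone sketch of just that folding step, using plain std::vector instead of LoDTensor (FoldBiases is an illustrative name; equal bias lengths are assumed, as the pass does when it loops over numel):

#include <cassert>
#include <cstdio>
#include <vector>

// The fused op has one bias, so at fuse time: new_bias[i] = fc_bias[i] + gru_bias[i].
std::vector<float> FoldBiases(const std::vector<float>& fc_bias,
                              const std::vector<float>& gru_bias) {
  assert(fc_bias.size() == gru_bias.size());  // assumption, mirroring the pass
  std::vector<float> fused(fc_bias.size());
  for (size_t i = 0; i < fused.size(); ++i) {
    fused[i] = fc_bias[i] + gru_bias[i];
  }
  return fused;
}

int main() {
  for (float v : FoldBiases({0.5f, -1.0f, 2.0f}, {1.5f, 1.0f, -2.0f})) {
    std::printf("%g ", v);  // prints: 2 0 0
  }
  std::printf("\n");
  return 0;
}

Folding the two biases offline is what lets the pass delete the elementwise_add node entirely instead of keeping a separate FC bias input on the fused op.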
+ +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. + +class FCGRUFusePass : public FusePassBase { + public: + virtual ~FCGRUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"fc_gru_fuse"}; +}; + +// Just FC without bias +class MulGRUFusePass : public FusePassBase { + public: + virtual ~MulGRUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"fc_nobias_gru_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa95d3e9f6c8221f6e48d192b73ad5135539dc75 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Build pattern + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) + ->assert_is_op_input("mul") + ->assert_var_not_persistable(); + patterns::FC fc_pattern(pattern, name_scope); + + // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. 
+ auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate(); + patterns::LSTM lstm_pattern(pattern, name_scope); + lstm_pattern(fc_out); + + // Create New OpDesc + auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x, + Node* weight_h, Node* bias, Node* hidden, Node* cell, + Node* xx, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fusion_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(X, input); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + SET_IN(Bias, bias); +#undef SET_IN + if (with_fc_bias) { + // Add FC-bias with LSTM-bias and create a new weight + PADDLE_ENFORCE(scope); + const std::string& new_bias_var = patterns::UniqueKey("NewBias"); + auto* bias_var = scope->Var(new_bias_var); + PADDLE_ENFORCE(bias_var); + auto* bias_tensor = bias_var->GetMutable(); + auto* lstm_bias_var = scope->FindVar(bias->Name()); + PADDLE_ENFORCE(lstm_bias_var); + const auto& lstm_bias_tensor = lstm_bias_var->Get(); + bias_tensor->Resize(lstm_bias_tensor.dims()); + + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + + auto* data = bias_tensor->mutable_data(platform::CPUPlace()); + + for (int i = 0; i < bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + lstm_bias_tensor.data()[i]; + } + op_desc.SetInput("Bias", {new_bias_var}); + } + + // Create temp variables. + const std::string BatchedInput = patterns::UniqueKey("BatchedInput"); + const std::string BatchedCellPreAct = + patterns::UniqueKey("BatchedCellPreAct"); + const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + + scope->Var(BatchedInput)->GetMutable(); + scope->Var(BatchedCellPreAct)->GetMutable(); + scope->Var(BatchedGate)->GetMutable(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetOutput("Cell", {cell->Name()}); + op_desc.SetOutput("XX", {xx->Name()}); + op_desc.SetOutput("BatchedGate", {BatchedGate}); + op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); + op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); +#define OP_SET_OUT(x) \ + const std::string x = patterns::UniqueKey(#x); \ + op_desc.SetOutput(#x, {x}); \ + scope->Var(x)->GetMutable() + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); + IR_NODE_LINK_TO(op, hidden); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, + fc_bias); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul, lstm, elementwise_add, fc_bias}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); + lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, + nullptr); + // Remove unneeded nodes. + std::unordered_set marked_nodes({mul, lstm}); + GraphSafeRemoveNodes(graph, marked_nodes); + } + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr MulLstmFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + false /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +std::unique_ptr FCLstmFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + true /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass); +REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op. 
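A sketch of how the two passes registered above would typically be driven, following the flow of fc_fuse_pass_tester.cc; FuseLstmSubgraphs is an illustrative helper name, pass.h is assumed to be pulled in transitively (as in the tester), and attaching the parameter Scope to the graph is left as a comment because the passes only require that kParamScopeAttr is present:

#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"

namespace paddle {
namespace framework {
namespace ir {

// Rewrites every (mul [+ elementwise_add]) + lstm sub-graph of `prog` into a
// single fusion_lstm op and returns the rewritten graph.
std::unique_ptr<ir::Graph> FuseLstmSubgraphs(const ProgramDesc& prog) {
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  // FCLstmFusePass/MulLstmFusePass read weights through
  // FusePassBase::param_scope(), so the parameter Scope must already be
  // attached to the graph under kParamScopeAttr (attachment omitted here).
  auto pass = PassRegistry::Instance().Get("fc_lstm_fuse_pass");
  graph = pass->Apply(std::move(graph));
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(fc_lstm_fuse_pass);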
+
+class FCLstmFusePass : public FusePassBase {
+ public:
+  virtual ~FCLstmFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_lstm_fuse"};
+};
+
+// Just FC without bias
+class MulLstmFusePass : public FusePassBase {
+ public:
+  virtual ~MulLstmFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"fc_nobias_lstm_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..877bbeb502252cac77095981641d7ce283ca1eb7
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+static const char kParamScopeAttr[] = "__param_scope__";
+static const char kFuseStatisAttr[] = "__fuse_statis__";
+
+class FusePassBase : public Pass {
+ public:
+  void Init(const std::string& repr, Graph* graph) const {
+    repr_ = repr;
+    graph_ = graph;
+  }
+
+  Scope* param_scope() const {
+    PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
+    return graph_->Get<framework::Scope*>(kParamScopeAttr);
+  }
+
+  void AddStatis(int count_of_fused) const {
+    PADDLE_ENFORCE(graph_);
+    PADDLE_ENFORCE(!repr_.empty());
+    if (!graph_->Has(kFuseStatisAttr)) {
+      graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
+    }
+    auto& info =
+        graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
+    info[repr_] = count_of_fused;
+  }
+
+  virtual ~FusePassBase() {}
+
+ protected:
+  mutable Graph* graph_;
+  mutable std::string repr_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..398f7095968e62f92d610f560d7574b27706d13e
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -0,0 +1,207 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::vector FindDistTrainSendVars( + const std::vector &nodes) { + std::vector send_vars; + // since parameters are all in block 0, + // it's enough to only scan send ops in block 0 + for (auto &node : nodes) { + auto op_vars = node->Op()->InputArgumentNames(); + send_vars.reserve(send_vars.size() + + std::distance(op_vars.begin(), op_vars.end())); + send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); + } + return send_vars; +} + +std::vector FindDistTrainRecvVars( + const std::vector &nodes) { + std::vector recv_vars; + for (auto &node : nodes) { + auto op_vars = node->Op()->OutputArgumentNames(); + recv_vars.reserve(recv_vars.size() + + std::distance(op_vars.begin(), op_vars.end())); + recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); + } + return recv_vars; +} + +bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, + const std::vector &recv_vars) { + if (send_vars.size() == 0 || recv_vars.size() == 0) { + return false; + } + + /** + * Check any of opvars contains `.block` and in sendvars + */ + auto checker = [](const std::vector &opvars, + const std::vector &rpc_vars) -> bool { + for (auto &var : opvars) { + // a variable name with the suffix `.block` means it's a splited + // variable by (DistributeTranspiler) + // [python/paddle/fluid/transpiler/distribute_transpiler.py] + if (var.find(".block") != std::string::npos && + std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { + return true; + } + } + return false; + }; + + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + return checker(output_var_names, send_vars) || + checker(input_var_names, recv_vars); +} + +Graph::Graph(const ProgramDesc &program) : program_(program) { + // Make the nodes id start from 0. + Node::ResetId(); + + VLOG(3) << "block in program:" << program_.Size(); + std::unordered_map all_vars; + for (auto *var : program.Block(0).AllVars()) { + all_vars.emplace(var->Name(), var); + } + + std::map> var_nodes; + for (auto *op : program.Block(0).AllOps()) { + ir::Node *node = CreateOpNode(op); + // For input args, reuse the same var name if it was created before. + // Otherwise, create a new one. + for (auto &each_var_name : op->InputArgumentNames()) { + ir::Node *var = nullptr; + if (var_nodes.find(each_var_name) != var_nodes.end()) { + var = var_nodes.at(each_var_name).back(); + } else if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + var_nodes[each_var_name].push_back(var); + } else { + // Operation input var can be optional (dispensable). Which means + // the operation doesn't really need the var at runtime. In this + // case, the no-existed var is ready at the beginning. + var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + var_nodes[each_var_name].push_back(var); + } + node->inputs.push_back(var); + var->outputs.push_back(node); + } + // For output args, always create a new var. 
+ for (auto &each_var_name : op->OutputArgumentNames()) { + ir::Node *var = nullptr; + if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + } else { + // Operation output vars can be @EMPTY@. For example, while_grad + // can have multi @EMPTY@ outputs with no VarDesc. + // TODO(panyx0718): Add a test. + var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + } + var_nodes[each_var_name].push_back(var); + node->outputs.push_back(var); + var->inputs.push_back(node); + } + } + + /** + * We should handle write after read(WAR) and write after write(WAW) here. + * Because some of the operators of the program can be executed parallelly. + * So, to make the program running in the right order, we should add the + * dependence of WAR and WAW. + * + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + + for (auto &var : var_nodes) { + auto &versions = var.second; + if (versions.size() <= 1) continue; + + auto it_new = versions.rbegin(); + auto it_old = versions.rbegin(); + ++it_old; + for (; it_old != versions.rend(); it_new = it_old, ++it_old) { + ir::Node *write_op = + (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; + const auto &read_ops = (*it_old)->outputs; + + PADDLE_ENFORCE(write_op, "The write_op should not be empty."); + + // Add write after write dependence + ir::Node *upstream_op = + (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0]; + // TODO(zcd): Add a test. + if (upstream_op && upstream_op != write_op) { + ir::Node *dep_var = CreateControlDepVar(); + write_op->inputs.push_back(dep_var); + upstream_op->outputs.push_back(dep_var); + dep_var->outputs.push_back(write_op); + dep_var->inputs.push_back(upstream_op); + } + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + // 2 ops might have been connected via other vars. + bool has_dep = false; + for (ir::Node *r_out : read_op->outputs) { + for (ir::Node *w_in : write_op->inputs) { + if (r_out == w_in) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + + ir::Node *dep_var = CreateControlDepVar(); + read_op->outputs.push_back(dep_var); + dep_var->inputs.push_back(read_op); + write_op->inputs.push_back(dep_var); + dep_var->outputs.push_back(write_op); + } + } + } +} + +bool IsControlDepVar(const ir::Node &var) { + return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos; +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..62f94a1c0e5a300438bbe5fea34b9a07df5d9ebf --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
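The constructor above guards against write-after-read (WAR) and write-after-write (WAW) hazards by inserting control-dependency variables between the readers and writer of an old version of a variable and the op that produces the next version. A toy, self-contained illustration of the WAR case (ToyOp and the printed edge are assumptions for illustration; the real Graph inserts a control-dep var node rather than printing):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// op1 reads "a", op2 later overwrites "a". Without an explicit dependency the
// two ops could run in parallel, so a control edge op1 -> op2 must be added.
struct ToyOp {
  std::string name;
  std::vector<std::string> reads, writes;
};

int main() {
  std::vector<ToyOp> program = {{"op1", {"a"}, {"b"}},   // b = op1(a)
                                {"op2", {"c"}, {"a"}}};  // a = op2(c)
  // last_readers[v] = ops that read the current version of v.
  std::map<std::string, std::vector<std::string>> last_readers;
  for (const auto& op : program) {
    for (const auto& v : op.writes) {
      for (const auto& reader : last_readers[v]) {
        if (reader != op.name) {
          std::printf("add control dep: %s -> %s (WAR on %s)\n",
                      reader.c_str(), op.name.c_str(), v.c_str());
        }
      }
      last_readers[v].clear();  // a new version of v starts here
    }
    for (const auto& v : op.reads) last_readers[v].push_back(op.name);
  }
  return 0;
}

Running it prints a single "op1 -> op2 (WAR on a)" edge, which is the ordering constraint the constructor encodes with a control-dep variable so that parallel executors stay correct.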
*/ + +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace { +void SortHelper( + const std::map> &adj_list, + ir::Node *node, std::unordered_set *visited, + std::vector *ret) { + visited->insert(node); + + for (auto adj : adj_list.at(node)) { + if (visited->find(adj) == visited->end()) { + SortHelper(adj_list, adj, visited, ret); + } + } + + VLOG(3) << "topology sort insert: " << node->Name() + << reinterpret_cast(node) << " input " << node->inputs.size(); + ret->push_back(node); +} + +bool HasCircleHelper( + ir::Node *node, + const std::map> &adj_list, + std::unordered_set *visited, + std::unordered_set *in_trace) { + if (visited->find(node) == visited->end()) { + visited->insert(node); + in_trace->insert(node); + + for (ir::Node *in : adj_list.at(node)) { + if (visited->find(in) == visited->end() && + HasCircleHelper(in, adj_list, visited, in_trace)) { + return true; + } else if (in_trace->find(in) != in_trace->end()) { + return true; + } + } + } + in_trace->erase(node); + return false; +} + +bool HasCircleInternal( + const std::map> &adj_list) { + std::unordered_set visited; + std::unordered_set in_trace; + for (auto &adj : adj_list) { + if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) { + return true; + } + } + return false; +} +} // namespace + +bool HasCircle(const Graph &graph) { + return HasCircleInternal(BuildOperationAdjList(graph)); +} + +std::vector TopologySortOperations(const Graph &graph) { + std::map> adj_list = + BuildOperationAdjList(graph); + PADDLE_ENFORCE(!HasCircleInternal(adj_list)); + std::unordered_set visited; + std::vector ret; + for (auto adj : adj_list) { + if (visited.find(adj.first) == visited.end()) { + SortHelper(adj_list, adj.first, &visited, &ret); + } + } + return ret; +} + +std::map> BuildOperationAdjList( + const Graph &graph) { + std::map> adj_list; + + for (auto &n : graph.Nodes()) { + if (n->NodeType() != ir::Node::Type::kOperation) continue; + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); + } + for (auto &var : n->inputs) { + for (auto &adj_n : var->inputs) { + PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + } + } + } + return adj_list; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..cd6c53a07f8f56781989739d995226bd02b3d3d0 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
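HasCircle and TopologySortOperations above are a plain DFS pair: grey-set cycle detection plus post-order emission over the op adjacency list produced by BuildOperationAdjList. A self-contained sketch of the same two steps on a string-keyed adjacency list (names are illustrative; the real helpers operate on ir::Node* and enforce the no-cycle precondition with PADDLE_ENFORCE):

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

using AdjList = std::map<std::string, std::set<std::string>>;  // node -> deps

// Cycle check: a node found in the current DFS trace means a back edge.
bool HasCycle(const std::string& n, const AdjList& adj,
              std::set<std::string>* visited, std::set<std::string>* in_trace) {
  if (visited->count(n)) return false;
  visited->insert(n);
  in_trace->insert(n);
  for (const auto& dep : adj.at(n)) {
    if (in_trace->count(dep)) return true;
    if (HasCycle(dep, adj, visited, in_trace)) return true;
  }
  in_trace->erase(n);
  return false;
}

// Post-order DFS: a node is emitted only after everything it depends on.
void TopoSort(const std::string& n, const AdjList& adj,
              std::set<std::string>* visited, std::vector<std::string>* out) {
  visited->insert(n);
  for (const auto& dep : adj.at(n)) {
    if (!visited->count(dep)) TopoSort(dep, adj, visited, out);
  }
  out->push_back(n);
}

int main() {
  // As in BuildOperationAdjList, an op's adjacency set holds its producers.
  AdjList adj = {{"op1", {}}, {"op2", {"op1"}}, {"op3", {"op2"}}};
  std::set<std::string> visited, in_trace;
  for (const auto& kv : adj) {
    if (HasCycle(kv.first, adj, &visited, &in_trace)) {
      std::printf("cycle!\n");
      return 1;
    }
  }
  visited.clear();
  std::vector<std::string> order;
  for (const auto& kv : adj) {
    if (!visited.count(kv.first)) TopoSort(kv.first, adj, &visited, &order);
  }
  for (const auto& n : order) std::printf("%s ", n.c_str());  // op1 op2 op3
  std::printf("\n");
  return 0;
}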
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace framework { +namespace ir { +// Test if the graph contains circle. +bool HasCircle(const Graph &graph); + +// Topology Sort the operations in the graph from inputs to outputs. +// `graph` cannot contain circle. +std::vector TopologySortOperations(const Graph &graph); + +// Build an adjacency list of operations for the `graph`. +std::map> BuildOperationAdjList( + const Graph &graph); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a260dd3da2a7863c06e51aa4feafd824ea254139 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o1->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o1); +} + +void BuildCircleGraph2(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +void BuildNoCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + 
v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, Basic) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + ASSERT_TRUE(HasCircle(g)); + + Graph g2(prog); + BuildCircleGraph2(&g2); + ASSERT_TRUE(HasCircle(g2)); + + auto adj_list = BuildOperationAdjList(g2); + for (auto& adj : adj_list) { + auto& adj_set = adj.second; + if (adj.first->Name() == "op1") { + ASSERT_EQ((*adj_set.begin())->Name(), "op2"); + } else if (adj.first->Name() == "op2") { + ASSERT_EQ((*adj_set.begin())->Name(), "op1"); + } else { + ASSERT_TRUE(false); + } + } + + Graph g3(prog); + BuildNoCircleGraph(&g3); + ASSERT_FALSE(HasCircle(g3)); + auto sorted = TopologySortOperations(g3); + std::map node_map; + for (size_t i = 0; i < sorted.size(); ++i) { + node_map[sorted[i]->Name()] = i; + } + ASSERT_EQ(node_map.at("op1"), 0UL); + ASSERT_EQ(node_map.at("op2"), 1UL); + ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..11d5998aafe1f325b94ef1a5ea1c13c72c13f5c9 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -0,0 +1,653 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + +size_t PDPattern::id_ = 0UL; + +PDNode* PDPattern::NewNode(const std::string& name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + + nodes_.emplace_back(new PDNode(this, name)); + auto* cur = nodes_.back().get(); + node_map_[name] = cur; + return cur; +} + +PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + + nodes_.emplace_back(new PDNode(std::move(teller), this, name)); + auto* cur = nodes_.back().get(); + node_map_[name] = cur; + return cur; +} + +PDNode* PDPattern::RetrieveNode(const std::string& id) const { + auto it = node_map_.find(id); + if (it == node_map_.end()) { + return nullptr; + } + + return it->second; +} + +void PDPattern::AddEdge(PDNode* a, PDNode* b) { + PADDLE_ENFORCE(a); + PADDLE_ENFORCE(b); + PADDLE_ENFORCE(a != b, "can't connect to the same nodes."); + edges_.emplace_back(a, b); +} + +void GraphPatternDetector::operator()(Graph* graph, + GraphPatternDetector::handle_t handler) { + if (!MarkPDNodesInGraph(*graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); + int id = 0; + for (auto& g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { + VLOG(3) << "mark pdnodes in graph"; + if (graph.Nodes().empty()) return false; + + for (auto& node : GraphTraits::DFS(graph)) { + for (const auto& pdnode : pattern_.nodes()) { + if (pdnode->Tell(&node)) { + VLOG(4) << "pdnode " << pdnode->name() << " marked"; + pdnodes2nodes_[pdnode.get()].insert(&node); + } + } + } + // Check to early stop if some PDNode can't find matched Node. + for (auto& pdnode : pattern_.nodes()) { + if (!pdnodes2nodes_.count(pdnode.get())) { + VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + for (auto& item : pdnodes2nodes_) { + for (auto& n : item.second) { + GetMarkedNodes(const_cast(&graph)).insert(n); + } + } + VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; + + return !pdnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void GraphPatternDetector::ValidateByNodeRole( + std::vector* subgraphs) { + std::vector result; + + subgraphs->erase( + std::remove_if( + subgraphs->begin(), subgraphs->end(), + [](const GraphPatternDetector::subgraph_t& subgraph) -> bool { + // Collect the inputs and outputs. 
+ std::unordered_set ios; + for (auto& item : subgraph) { + if (!item.first->IsIntermediate()) { + ios.insert(item.second); + } + } + for (auto& item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto* x : item.second->inputs) { + if (!ios.count(x)) { + return true; + } + } + for (auto* x : item.second->outputs) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node* node, PDNode* pat) { + if (nodes_.count(node)) { + if (!roles.count(pat)) return false; + return roles[pat] == node; + } + return !roles.count(pat) || roles.at(pat) == node; + } + + void Register(Node* node, PDNode* pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. +bool IsNodesLink(Node* a, Node* b) { + for (auto* node : a->outputs) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector +GraphPatternDetector::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); + auto* first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pdnodes2nodes_.count(first_pnode)) return result; + for (auto* node : pdnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PDNode to subgraphs by deducing the connection relations defined + // in edges of PDNodes. + for (const auto& edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PDNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto& pre_groups = bi_records[step % 2]; + auto& cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node* source : pdnodes2nodes_[edge.first]) { + for (Node* target : pdnodes2nodes_[edge.second]) { + VLOG(8) << "check " << source->id() << " -- " << target->id(); + // TODO(Superjomn) add some prune strategies. 
+ for (const auto& group : pre_groups) { + HitGroup new_group = group; + if (IsNodesLink(source, target) && + new_group.Match(source, edge.first)) { + new_group.Register(source, edge.first); + if (new_group.Match(target, edge.second)) { + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + for (auto& group : cur_groups) { + for (auto& item : group.roles) { + VLOG(4) << "node " << item.second->id() << " as " << item.first->name(); + } + VLOG(4) << "========================================================="; + } + } + + for (auto& group : bi_records[step % 2]) { + GraphPatternDetector::subgraph_t subgraph; + for (auto& role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +void GraphPatternDetector::UniquePatterns( + std::vector* subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + for (auto& g : *subgraphs) { + size_t key = 0; + for (auto& item : g) { + key ^= std::hash{}(item.first); + key ^= std::hash{}(item.second); + } + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void GraphPatternDetector::RemoveOverlappedMatch( + std::vector* subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto& subgraph : *subgraphs) { + bool valid = true; + for (auto& item : subgraph) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto& item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +std::string PDPattern::DotString() const { + using inference::analysis::Dot; + Dot dot; + int id = 0; + // Create Nodes + std::unordered_map node2dot; + for (const auto& node : nodes()) { + std::string node_id = "Node" + std::to_string(id++); + dot.AddNode(node_id, {}, node->name()); + node2dot[node.get()] = node_id; + } + // Create Edges + for (const auto& edge : edges()) { + if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { + LOG(ERROR) << "no node " << edge.first << " " << edge.second; + continue; + } + auto& src = node2dot.at(edge.first); + auto& trg = node2dot.at(edge.second); + dot.AddEdge(src, trg, {}); + } + return dot.Build(); +} + +PDNode& PDNode::LinksTo(const std::vector& others) { + // extend outlinks. + for (PDNode* x : others) { + pattern_->AddEdge(this, x); + } + return *this; +} + +PDNode& PDNode::LinksFrom(const std::vector& others) { + // extend outlinks. 
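DetectPatterns above grows partial matches ("hit groups") one pattern edge at a time: phase 1 collected the candidate nodes for each PDNode, and phase 2 keeps only candidate pairs that are actually linked in the graph. A toy, self-contained version of that step for a single pattern edge A -> B (the node names and candidate sets are made up for illustration):

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Graph edges: node -> successors.
  std::map<std::string, std::set<std::string>> g = {
      {"x", {"y"}}, {"y", {"z"}}, {"z", {}}};
  std::set<std::string> cand_A = {"x", "y"};  // nodes that matched PDNode A
  std::set<std::string> cand_B = {"y", "z"};  // nodes that matched PDNode B
  std::vector<std::map<std::string, std::string>> groups;  // role -> node
  for (const auto& a : cand_A) {
    for (const auto& b : cand_B) {
      if (g[a].count(b)) {  // the analogue of IsNodesLink(a, b)
        groups.push_back({{"A", a}, {"B", b}});
      }
    }
  }
  for (const auto& grp : groups) {
    std::printf("match: A=%s B=%s\n", grp.at("A").c_str(), grp.at("B").c_str());
  }
  // prints: A=x B=y and A=y B=z
  return 0;
}

The real implementation repeats this for every pattern edge, carrying the already-bound roles forward in each HitGroup, then de-duplicates and drops overlapping matches.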
+ for (PDNode* x : others) { + pattern_->AddEdge(x, this); + } + return *this; +} + +PDNode* PDNode::assert_is_op() { + asserts_.emplace_back([](Node* x) { return x && x->IsOp(); }); + return this; +} +PDNode* PDNode::assert_is_op(const std::string& op_type) { + asserts_.emplace_back([op_type](Node* x) { + return x && x->IsOp() && x->Op()->Type() == op_type; + }); + return this; +} +PDNode* PDNode::assert_is_var() { + asserts_.emplace_back([](Node* x) { return x && x->IsVar(); }); + return this; +} +PDNode* PDNode::assert_var_not_persistable() { + assert_is_var(); + asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); }); + return this; +} +PDNode* PDNode::assert_is_persistable_var() { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { return x->Var()->Persistable(); }); + return this; +} +PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type, + const std::string& argument, int nth) { + assert_is_var(); + assert_is_op_input(op_type); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->outputs) { + if (op->IsOp() && op->Op()->Type() == op_type && + IsNthInput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type, + const std::string& argument, int nth) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->inputs) { + if (op->IsOp() && op->Op()->Type() == op_type && + IsNthOutput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->inputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_output(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->inputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_output(const std::string& op_type, + const std::string& argument) { + assert_is_var(); + assert_is_op_nth_output(op_type, argument, 0); + return this; +} +PDNode* PDNode::assert_is_op_input(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_input(const std::string& op_type, + const std::string& argument) { + assert_is_var(); + assert_is_op_nth_input(op_type, argument, 0); + return this; +} +PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) { + assert_is_op(op_type); + asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; }); + return this; +} +PDNode* PDNode::assert_op_has_n_outputs(const std::string& op_type, size_t n) { + assert_is_op(op_type); + asserts_.emplace_back([=](Node* x) { return x->outputs.size() == n; }); + return this; +} +PDNode* 
PDNode::assert_more(PDNode::teller_t&& teller) { + asserts_.emplace_back(std::move(teller)); + return this; +} + +bool VarLinksToOp(Node* node, const std::string& op_type) { + for (auto* out : node->outputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} +bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (op->Op()->Input(argument).size() <= nth) return false; + return var->Name() == op->Op()->Input(argument)[nth]; +} +bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (op->Op()->Output(argument).size() <= nth) return false; + return var->Name() == op->Op()->Output(argument)[nth]; +} +void GraphSafeRemoveNodes(Graph* graph, + const std::unordered_set& nodes) { + for (auto* node : nodes) { + graph->RemoveNode(const_cast(node)); + } + + for (auto* node : graph->Nodes()) { + for (auto it = node->inputs.begin(); it != node->inputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->inputs.erase(it); + } else { + it++; + } + } + for (auto it = node->outputs.begin(); it != node->outputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->outputs.erase(it); + } else { + it++; + } + } + } +} +bool VarLinksFromOp(Node* node, const std::string& op_type) { + for (auto* out : node->inputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +PDNode* patterns::ConvReLU::operator()( + paddle::framework::ir::PDNode* conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto* conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // Bias + auto* conv_bias_var = pattern->NewNode(conv_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Bias"); + // intermediate variable, will be removed in the IR after fuse. + auto* conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("relu"); + // output + auto* relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) + .LinksTo({conv_out_var}); + relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + +PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, + bool with_bias) { + // Create shared nodes. + x->assert_is_op_input("mul", "X"); + auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul"); + + auto* mul_w_var = pattern->NewNode(w_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("mul", "Y"); + + auto* mul_out_var = + pattern->NewNode(mul_out_repr())->assert_is_op_output("mul"); + + if (!with_bias) { // not with bias + // Add links. + mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var}); + return mul_out_var; + + } else { // with bias + mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + // Create operators. + auto* elementwise_add = pattern->NewNode(elementwise_add_repr()) + ->assert_is_op("elementwise_add"); + // Create variables. 
+ auto* bias = pattern->NewNode(bias_repr()) + ->assert_is_op_input("elementwise_add") + ->AsInput(); + + auto* fc_out = pattern->NewNode(Out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + + mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var}); + elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); + return fc_out; + } +} + +PDNode* patterns::LSTM::operator()(PDNode* x) { + x->assert_is_op_input("lstm", "Input"); + auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); +#define NEW_NODE(arg__, io__) \ + auto* arg__ = \ + pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__); + + // Currently, the H0 and C0 are optional + // TODO(Superjomn) upgrade the fuse framework to support optional. + // NEW_NODE(H0, input); + // NEW_NODE(C0, input); + NEW_NODE(Weight, input); + NEW_NODE(Bias, input); + + NEW_NODE(Hidden, output); + NEW_NODE(Cell, output); + NEW_NODE(BatchGate, output); + NEW_NODE(BatchCellPreAct, output); +#undef NEW_NODE + + lstm_op->LinksFrom({x, Weight, Bias}); + lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); + return Hidden; +} + +PDNode* patterns::GRU::operator()(PDNode* x) { + x->assert_is_op_input("gru", "Input"); + auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru"); +#define NEW_NODE(arg__, io__) \ + auto* arg__ = \ + pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__); + + NEW_NODE(Weight, input); + // TODO(Superjomn): upgrade the fuse framework to support optional. + // H0 and bias are optional + NEW_NODE(Bias, input); // also optional + // NEW_NODE(H0, input); + + NEW_NODE(Hidden, output); + // below are intermediate + NEW_NODE(BatchGate, output); + NEW_NODE(BatchResetHiddenPrev, output); + NEW_NODE(BatchHidden, output); +#undef NEW_NODE + + BatchGate->AsIntermediate(); + BatchResetHiddenPrev->AsIntermediate(); + BatchHidden->AsIntermediate(); + + gru_op->LinksFrom({x, Weight, Bias}); + gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); + return Hidden; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..1a8d9cefbfa570d2ac3f4fc32d50d705ddc67a75 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -0,0 +1,459 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_TESTING +#include +#endif + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/dot.h" + +namespace paddle { +namespace framework { +namespace ir { +class PDPattern; + +// Some basic terminologies: +// - PDPattern: a pattern defined as a data flow graph. 
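The pattern structs defined above (FC, LSTM, GRU, ConvReLU) are meant to be reused across passes. As a sketch, here is how one of them could be consumed outside a fuse pass, for example just to count FC sub-graphs; CountFCSubgraphs and the "count_fc" name scope are illustrative, while the calls mirror fc_fuse_pass.cc:

#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

int CountFCSubgraphs(Graph* graph) {
  GraphPatternDetector gpd;
  auto* x = gpd.mutable_pattern()
                ->NewNode("count_fc/x")
                ->AsInput()
                ->assert_is_op_input("mul", "X");
  patterns::FC fc_pattern(gpd.mutable_pattern(), "count_fc");
  fc_pattern(x, true /*with bias*/);

  int count = 0;
  // Each detected subgraph maps every PDNode of the pattern to a matched Node*.
  gpd(graph, [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
    ++count;
  });
  return count;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle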
+// - PDNode: the node in the pattern, each PDNode represents an `ir::Node` +// that meets some conditions defined in `PDNode.teller`. +// - A pattern is defined with PDNodes with edges. + +// Pattern detector node. This node helps to build a pattern. +struct PDNode { + // tell whether an ir::Node* is a candidation for a PDNode. + using teller_t = std::function; + enum class Type { kOp, kVar }; + enum class Role { + kUnknown, // No role, + kInput, // an input and will be retained, + kOutput, // an output and will be retained, + kIntermediate // will be removed after handler. + }; + + // this link to others + PDNode& LinksTo(const std::vector& others); + PDNode& LinksFrom(const std::vector& others); + + bool Tell(Node* node) const { + if (teller_) return teller_(node); + + for (auto& asrt : asserts_) { + if (!asrt(node)) return false; + } + return true; + } + + bool IsOp() const { return type_ == Type::kOp; } + bool IsVar() const { return type_ == Type::kVar; } + + const std::string& name() const { return name_; } + + PDNode& operator=(const PDNode&) = delete; + PDNode(const PDNode&) = delete; + + // Mark this node is an Input of a subgraph and will be retained. + PDNode* AsInput() { + role_ = Role::kInput; + return this; + } + // Mark this node is an Output of a subgraph and will be retained. + PDNode* AsOutput() { + role_ = Role::kOutput; + return this; + } + // Mark this node will be removed, so all the links should be inside a matched + // sub-graph. + PDNode* AsIntermediate() { + role_ = Role::kIntermediate; + return this; + } + + bool IsIntermediate() const { return role_ == Role::kIntermediate; } + bool IsInput() const { return role_ == Role::kInput; } + bool IsOutput() const { return role_ == Role::kOutput; } + + // Assertions, helper functions to simplify the pattern definition. + PDNode* assert_is_op(); + PDNode* assert_is_op(const std::string& op_type); + PDNode* assert_is_var(); + PDNode* assert_var_not_persistable(); + PDNode* assert_is_persistable_var(); + PDNode* assert_is_op_output(const std::string& op_type); + PDNode* assert_is_op_output(const std::string& op_type, + const std::string& argument); + PDNode* assert_is_op_input(const std::string& op_type); + PDNode* assert_is_op_input(const std::string& op_type, + const std::string& argument); + PDNode* assert_is_op_nth_input(const std::string& op_type, + const std::string& argument, int nth); + PDNode* assert_is_op_nth_output(const std::string& op_type, + const std::string& argument, int nth); + PDNode* assert_is_only_input_of_op(const std::string& op_type); + PDNode* assert_is_only_output_of_op(const std::string& op_type); + PDNode* assert_op_has_n_inputs(const std::string& op_type, size_t n); + PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n); + PDNode* assert_more(teller_t&& teller); + + private: + PDNode(PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : pattern_(pattern), name_(name), type_(type) {} + PDNode(teller_t&& teller, PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : teller_(std::move(teller)), + pattern_(pattern), + name_(name), + type_(type) { + PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); + } + + PDNode(PDNode&& other) = default; + + friend class PDPattern; + + // Will removed latter. + teller_t teller_; + std::vector asserts_; + PDPattern* pattern_; + std::string name_; + Type type_; + Role role_{Role::kUnknown}; +}; + +/* + * A pattern in a graph, which defined with PDNode and edges. 
Most graph + * patterns can be divided into PDNodes and link relations between them. + * + * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD + * operators from the computation graph, the MUL's output should have only one + * consumer which is the ELEMENTWISE_ADD. + * This pattern can be defined as with the following pseudo codes + * + * // Create two operator PDNodes. + * MUL = PDPattern.NewNode().assert_is_op("mul"); + * ELE = PDPattern.NewNode().assert_is_op("elementwise_add"); + * // Create the variable PDNodes. + * MUL_out = PDPattern.NewNode().assert_is_op_output("mul") \ + * .assert_is_op_input("elementwise_add") \ + * .AsIntermediate(); + * // Add relations. + * MUL->LinksTo({MUL_out}); + * MUL_out->LinksTo({ELE}); + * + * One can add more specific asserts for PDNodes or edges, both the Operator + * and Variable Nodes can be ruled in PDNode.assert_more(...). + * + * PDPattern can record the general patterns, such as the pattern represents + * - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place. + * - Ops whose inputs and outputs share the same variables + */ +class PDPattern { + public: + using edge_t = std::pair; + + void AddEdge(PDNode* a, PDNode* b); + + PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); + PDNode* NewNode(const std::string& name = NewID()); + PDNode* NewNode(const std::string& prefix, const std::string& name) { + return NewNode(prefix + "/" + name); + } + PDNode* RetrieveNode(const std::string& id) const; + + const std::vector>& nodes() const { return nodes_; } + const std::vector& edges() const { return edges_; } + + std::string DotString() const; + + private: +#ifdef PADDLE_WITH_TESTING + FRIEND_TEST(PDPattern, AddEdge); + FRIEND_TEST(PDPattern, NewNode); +#endif + + static std::string NewID() { return "pdnode-" + std::to_string(id_++); } + + std::vector> nodes_; + std::vector edges_; + std::unordered_map node_map_; + static size_t id_; +}; + +/* + * GraphPatternDetector helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. + * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PDNodes in a PDPattern, + * 2. Extend a PDNode to subgraphs by deducing the connection relation defined + * in PAPattern(the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. + * + * Usage: + * // Create a detector + * GraphPatternDetector detector; + * // Define the detector's pattern, by adding PDNode and define the edges. + * auto* node0 = detector.mutable_pattern().AddNode(...) + * auto* node1 = detector.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * detector.mutable_pattern().AddEdge(node0, node1); + * // Create an handler, to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * GraphPatternDetector::handle_t handler = some labmda + * // Execute the detector. + * detector(&graph, handler); + */ +class GraphPatternDetector { + public: + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(Graph* graph, handle_t handler); + + const PDPattern& pattern() const { return pattern_; } + PDPattern* mutable_pattern() { return &pattern_; } + + private: + // Mark the nodes that fits the pattern. 
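+  // Marking associates every PDNode of pattern_ with all the ir::Nodes whose
+  // Tell() accepts them (collected in pdnodes2nodes_), roughly:
+  //   for (auto& node : GraphTraits::DFS(graph))
+  //     for (const auto& pdnode : pattern_.nodes())
+  //       if (pdnode->Tell(&node)) pdnodes2nodes_[pdnode.get()].insert(&node);
+  // The boolean result tells the caller whether any PDNode matched at all, so
+  // the later phases can be skipped early. (Sketch only; see the .cc for the
+  // exact traversal.)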
+ bool MarkPDNodesInGraph(const ir::Graph& graph); + + // Detect all the pattern and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PDNodes will be removed, so can't shared by multiple + // patterns. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + +#ifdef PADDLE_WITH_TESTING + FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph); + FRIEND_TEST(GraphPatternDetecter, DetectPatterns); +#endif + + private: + using hit_rcd_t = + std::pair; + PDPattern pattern_; + std::unordered_map> pdnodes2nodes_; +}; + +// some helper methods. + +// Tell if a var links to an Op +bool VarLinksToOp(Node* node, const std::string& op_type); + +// Tell if an op links to a var +bool VarLinksFromOp(Node* node, const std::string& op_type); + +// Check whether a var node is a op node's nth input. +bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth); + +// Tell whether a var node is a op node's nth output. +bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth); + +// Graph safely remove some nodes, will automatically clean up the edges. +void GraphSafeRemoveNodes(Graph* graph, + const std::unordered_set& nodes); + +// Some pre-defined patterns those can be reused in multiple passes. +// The related Fluid Layer or Op should be one pattern here for better reusage +// accross different fusion. +namespace patterns { + +struct KeyCounter { + static KeyCounter& Instance() { + static KeyCounter x; + return x; + } + + int IncCounter(const std::string& key) { return dic_[key]++; } + + private: + std::unordered_map dic_; +}; + +// Generate a unique PDNode's name with name_scope and id. +// The format is {name_scope}/{repr}/{id}/{name} +static std::string PDNodeName(const std::string& name_scope, + const std::string& repr, size_t id, + const std::string& name) { + return string::Sprintf("%s/%s/%d/%s", name_scope, repr, id, name); +} +// Generate a unique PDNode's name. +// The format is {name_scope}/{repr}/{id} +static std::string PDNodeName(const std::string& name_scope, + const std::string& repr) { + return string::Sprintf("%s/%s/%d", name_scope, repr, + KeyCounter::Instance().IncCounter(repr)); +} +// Generate a unique key. It can be used for a universally unique temporary +// name. +// The format is {repr}/{id} +static std::string UniqueKey(const std::string& repr) { + return string::Sprintf("%s/%d", repr, + KeyCounter::Instance().IncCounter(repr)); +} + +// Declare a PDNode in a pattern, will create two methods: +// std::string xxx_repr(); return this PDNode's string id. +// PDNode* xxx_n(); return the corresponding PDNode. +#define PATTERN_DECL_NODE(name__) \ + std::string name__##_repr() const { \ + return PDNodeName(name_scope_, repr_, id_, #name__); \ + } \ + PDNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); } + +// Get an ir::Node* from the matched subgraph. +// var: variable. +// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition. +// pat: the pattern object. 
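+// A typical handler body then looks like this (sketch, assuming a ConvReLU
+// pattern object named conv_relu and a subgraph_t parameter named subgraph):
+//   GraphPatternDetector::handle_t handler =
+//       [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
+//         GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv, conv_relu);
+//         GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu);
+//         // conv_op and relu_out are the concrete ir::Node* of this match.
+//       };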
+#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat) \ + PADDLE_ENFORCE(subgraph.count(pat.arg##_n()), \ + "Node not found for PDNode %s", pat.arg##_repr()); \ + Node* var = subgraph.at(pat.arg##_n()); \ + PADDLE_ENFORCE(var, "node %s not exists in the sub-graph", #arg) + +// The base class of all the patterns. +struct PatternBase { + PatternBase(PDPattern* pattern, const std::string& name_scope, + const std::string& repr) + : pattern(pattern), + name_scope_(name_scope), + repr_(repr), + id_(KeyCounter::Instance().IncCounter(repr)) {} + + PDPattern* pattern; + + protected: + std::string name_scope_; + std::string repr_; + size_t id_; +}; + +// CONV with ReLU +// op: conv + relu +// named nodes: +// conv_input, conv_weight, +// conv_bias, conv_out, conv, +// relu_out, relu +struct ConvReLU : public PatternBase { + ConvReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_relu") {} + + PDNode* operator()(PDNode* conv_input); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_bias); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(relu_out); +}; + +// FC with bias +// op: mul + elementwise_add +// named nodes: +// mul, elementwise_add +// w, mul_out, bias, fc_out +struct FC : public PatternBase { + FC(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "fc") {} + + PDNode* operator()(PDNode* x, bool with_bias); + + // declare operator node's name + PATTERN_DECL_NODE(fc); + PATTERN_DECL_NODE(mul); + PATTERN_DECL_NODE(elementwise_add); + // declare variable node's name + PATTERN_DECL_NODE(w); + PATTERN_DECL_NODE(mul_out); // (x,w) -> mul_out + PATTERN_DECL_NODE(bias); + PATTERN_DECL_NODE(Out); +}; + +struct LSTM : public PatternBase { + LSTM(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "lstm") {} + + PDNode* operator()(PDNode* x); + + // Operators + PATTERN_DECL_NODE(lstm); + + // Inputs + PATTERN_DECL_NODE(Input); + PATTERN_DECL_NODE(H0); + PATTERN_DECL_NODE(C0); + PATTERN_DECL_NODE(Weight); + PATTERN_DECL_NODE(Bias); + + // Outputs + PATTERN_DECL_NODE(Hidden); + PATTERN_DECL_NODE(Cell); + PATTERN_DECL_NODE(BatchGate); + PATTERN_DECL_NODE(BatchCellPreAct); +}; + +struct GRU : public PatternBase { + GRU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "gru") {} + + PDNode* operator()(PDNode* x); + + // Operators + PATTERN_DECL_NODE(gru); + + // Inputs + PATTERN_DECL_NODE(Bias); + PATTERN_DECL_NODE(Weight); + + // Outputs + PATTERN_DECL_NODE(BatchGate); + PATTERN_DECL_NODE(BatchResetHiddenPrev); + PATTERN_DECL_NODE(BatchHidden); + PATTERN_DECL_NODE(Hidden); +}; + +} // namespace patterns + +// Link two ir::Nodes from each other. +#define IR_NODE_LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c466fb21fb46e09961dc874e9e39655f83d17c6 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void BuildGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(PDPattern, NewNode) { + PDPattern x; + auto* n = x.NewNode([](Node* x) { return true; }); + ASSERT_TRUE(n); + ASSERT_EQ(x.nodes_.size(), 1UL); +} + +TEST(PDPattern, AddEdge) { + PDPattern x; + auto* a = x.NewNode([](Node* x) { return true; }); + auto* b = x.NewNode([](Node* x) { return true; }); + ASSERT_TRUE(a); + ASSERT_TRUE(b); + x.AddEdge(a, b); + ASSERT_EQ(x.nodes_.size(), 2UL); + ASSERT_EQ(x.edges_.size(), 1UL); + ASSERT_EQ(x.edges_.front().first, a); + ASSERT_EQ(x.edges_.front().second, b); + + ASSERT_EQ(x.nodes().size(), 2UL); + ASSERT_EQ(x.edges().size(), 1UL); + ASSERT_EQ(x.edges().front().first, a); + ASSERT_EQ(x.edges().front().second, b); +} + +TEST(GraphPatternDetecter, MarkPDNodesInGraph) { + GraphPatternDetector x; + // mark o2, o3, v2 + + // The pattern is a graph: + // o2(a node named o2) -> v2(a node named v2) + // v2 -> o3(a node named o3) + auto* o2 = x.pattern_.NewNode([](Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->Name() == "op2" && node->IsOp(); + }); + auto* o3 = x.pattern_.NewNode([](Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->Name() == "op3" && node->IsOp(); + }); + auto* v2 = x.pattern_.NewNode([](Node* node) { + // The teller can be any condition, such as op type, or variable's shape. 
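+    // For instance, a teller could just as well require something like
+    //   node->IsVar() && node->Var() && node->Var()->Persistable()
+    // when matching real variables; this test only matches by name.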
+ return node && node->Name() == "var2" && node->IsVar(); + }); + + ASSERT_FALSE(o2->Tell(nullptr)); + ASSERT_FALSE(o3->Tell(nullptr)); + ASSERT_FALSE(v2->Tell(nullptr)); + + x.pattern_.AddEdge(o2, v2); + x.pattern_.AddEdge(v2, o3); + + ASSERT_EQ(x.pattern_.edges().size(), 2UL); + ASSERT_EQ(x.pattern_.edges()[0].first, o2); + ASSERT_EQ(x.pattern_.edges()[0].second, v2); + ASSERT_EQ(x.pattern_.edges()[1].first, v2); + ASSERT_EQ(x.pattern_.edges()[1].second, o3); + + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + x.MarkPDNodesInGraph(graph); + + ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL); + + auto subgraphs = x.DetectPatterns(); + ASSERT_EQ(subgraphs.size(), 1UL); +} + +TEST(GraphPatternDetecter, MultiSubgraph) { + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + GraphPatternDetector x; + + // The pattern is a graph: + // op -> var + auto* any_op = x.mutable_pattern()->NewNode( + [](Node* node) { + return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); + }, + "OP0"); + auto* any_var = x.mutable_pattern() + ->NewNode([](Node* node) { return node->IsVar(); }, "VAR") + ->AsIntermediate(); + auto* any_op1 = x.mutable_pattern()->NewNode( + [](Node* node) { return node->IsOp(); }, "OP1"); + + x.mutable_pattern()->AddEdge(any_op, any_var); + x.mutable_pattern()->AddEdge(any_var, any_op1); + + int count = 0; + GraphPatternDetector::handle_t handle = [&]( + const GraphPatternDetector::subgraph_t& s, Graph* g) { + LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " + << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); + count++; + }; + + x(&graph, handle); + + // 1. Detect op3 -> var4 -> op5 + // 2. Detect op2 -> var2 -> op3 + // 3. Detect op2 -> var2 -> op4 + // 4. Detect op2 -> var3 -> op5 + // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); +} + +TEST(GraphPatternDetector, IntermediateCheck) { + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + // o2->v2->o3 + // o2->v2->o4 + // check o2+o3 fuse, should fail because v2 also link to o4. + GraphPatternDetector detector; + auto* op2 = detector.mutable_pattern()->NewNode( + [](Node* x) { return x && x->IsOp() && x->Name() == "op2"; }, "op2"); + auto* op3 = detector.mutable_pattern()->NewNode( + [](Node* x) { return x && x->IsOp() && x->Name() == "op3"; }, "op3"); + auto* v2 = + detector.mutable_pattern() + ->NewNode( + [](Node* x) { return x && x->IsVar() && x->Name() == "var2"; }, + "var2") + ->AsIntermediate(); + v2->LinksFrom({op2}).LinksTo({op3}); + + int count = 0; + detector(&graph, [&](const GraphPatternDetector::subgraph_t& g, + Graph* graph) { ++count; }); + EXPECT_EQ(count, 0); + + count = 0; + v2->AsInput(); + detector(&graph, [&](const GraphPatternDetector::subgraph_t& g, + Graph* graph) { ++count; }); + ASSERT_EQ(count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cadda49c399a6d65079cacedfea61f4fd580a69a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope &scope, + const platform::Place &place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { + auto &inputs = op_desc.Input("X"); + auto default_var_type = proto::VarType::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string &name) { + return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarType::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(default_var_type); + } +}; + +class DummyOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class DummyOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override {} +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { +namespace framework { + +TEST(GraphTest, Basic) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarType::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarType::LOD_TENSOR, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + std::unique_ptr g(new ir::Graph(prog)); + std::vector nodes(g->Nodes().begin(), 
g->Nodes().end()); + for (ir::Node *n : nodes) { + if (n->Name() == "sum") { + ASSERT_EQ(n->inputs.size(), 3UL); + ASSERT_EQ(n->outputs.size(), 1UL); + } else if (n->Name() == "test_a" || n->Name() == "test_b" || + n->Name() == "test_c") { + ASSERT_EQ(n->inputs.size(), 0UL); + ASSERT_EQ(n->outputs.size(), 1UL); + } else if (n->Name() == "test_out") { + ASSERT_EQ(n->inputs.size(), 1UL); + ASSERT_EQ(n->outputs.size(), 0UL); + } + } + ASSERT_EQ(nodes.size(), 5); +} + +TEST(GraphTest, WriteAfterRead) { + // void Test() { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + op = prog.MutableBlock(0)->AppendOp(); + op->SetType("dummy"); + op->SetInput("X", {"c"}); + op->SetOutput("Out", {"a"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + + std::unique_ptr g(new ir::Graph(prog)); + ir::Node *control_dep1 = nullptr; + ir::Node *control_dep2 = nullptr; + for (ir::Node *n : g->Nodes()) { + if (n->Name() == "sum") { + ASSERT_EQ(n->outputs[0]->Name(), "b"); + ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); + control_dep1 = n->outputs[1]; + ASSERT_EQ(n->outputs.size(), 2); + } + if (n->Name() == "dummy") { + ASSERT_EQ(n->inputs[0]->Name(), "c"); + ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); + control_dep2 = n->inputs[1]; + ASSERT_EQ(n->inputs.size(), 2); + } + } + ASSERT_EQ(control_dep1, control_dep2); +} + +TEST(GraphTest, WriteAfterWrite) { + // void Test() { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + op = prog.MutableBlock(0)->AppendOp(); + op->SetType("dummy"); + op->SetInput("X", {"c"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + + std::unique_ptr g(new ir::Graph(prog)); + ir::Node *control_dep1 = nullptr; + ir::Node *control_dep2 = nullptr; + for (ir::Node *n : g->Nodes()) { + if (n->Name() == "sum") { + ASSERT_EQ(n->outputs[0]->Name(), "b"); + ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); + ASSERT_EQ(n->outputs.size(), 2); + control_dep1 = n->outputs[1]; + } + if (n->Name() == "dummy") { + ASSERT_EQ(n->inputs[0]->Name(), "c"); + ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); + control_dep2 = n->inputs[1]; + ASSERT_EQ(n->inputs.size(), 2); + } + } + ASSERT_NE(control_dep1, nullptr); + ASSERT_NE(control_dep2, nullptr); + ASSERT_EQ(control_dep1, control_dep2); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..414d8f79b15de091c62af5fe099ffae144156e4e --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr GraphToProgramPass::ApplyImpl( + std::unique_ptr graph) const { + ProgramDesc& program = Get("program"); + + std::unique_ptr program_pb( + new proto::ProgramDesc(*program.Proto())); + + auto block = program_pb->mutable_blocks(kRootBlockIndex); + block->clear_vars(); + std::unordered_set visited_vars; + for (ir::Node* n : graph->Nodes()) { + if (n->NodeType() == ir::Node::Type::kVariable) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { + visited_vars.insert(n->Var()->Name()); + block->add_vars()->MergeFrom(*n->Var()->Proto()); + } + } + } + + block->clear_ops(); + std::vector nodes = TopologySortOperations(*graph); + for (ir::Node* n : nodes) { + if (!n->Op()) { + continue; + } + block->add_ops()->MergeFrom(*n->Op()->Proto()); + } + + program.CopyFrom(*program_pb); + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..124ec5a8e771fb768b31fa2e9f5143db96154490 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphToProgramPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d51d9751a28d2b1549096b1984d67b55f913da6 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildNoCircleGraph(Graph* g) { + OpDesc op1; + op1.SetType("op1"); + OpDesc op2; + op2.SetType("op2"); + OpDesc op3; + op3.SetType("op3"); + OpDesc op4; + op4.SetType("op4"); + OpDesc op5; + op5.SetType("op5"); + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* o1 = g->CreateOpNode(&op1); + ir::Node* o2 = g->CreateOpNode(&op2); + ir::Node* o3 = g->CreateOpNode(&op3); + ir::Node* o4 = g->CreateOpNode(&op4); + ir::Node* o5 = g->CreateOpNode(&op5); + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + v2->inputs.push_back(o2); + // o4->v3->o5 + o4->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o4); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphToProgramPass, Basic) { + ProgramDesc prog; + std::unique_ptr g(new Graph(prog)); + BuildNoCircleGraph(g.get()); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + + ProgramDesc compiled_prog; + pass->SetNotOwned("program", &compiled_prog); + pass->Apply(std::move(g)); + std::vector ops = compiled_prog.Block(0).AllOps(); + EXPECT_EQ(ops[0]->Type(), "op1"); + EXPECT_EQ(ops[1]->Type(), "op2"); + if (ops[2]->Type() == "op3") { + EXPECT_EQ(ops[3]->Type(), "op4"); + } else if (ops[2]->Type() == "op4") { + EXPECT_EQ(ops[3]->Type(), "op3"); + } + EXPECT_EQ(ops[4]->Type(), "op5"); + + std::unordered_set vars; + for (VarDesc* v : compiled_prog.Block(0).AllVars()) { + vars.insert(v->Name()); + } + EXPECT_TRUE(vars.find("var1") != vars.end()); + EXPECT_TRUE(vars.find("var2") != vars.end()); + EXPECT_TRUE(vars.find("var3") != vars.end()); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f548913e4e1d9d5bc5bdace8b92db9065cf3b5e --- /dev/null +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph_traits.h" + +namespace paddle { +namespace framework { +namespace ir { + +// +// NodesDFSIterator +// +NodesDFSIterator::NodesDFSIterator(const std::vector &source) { + for (auto *x : source) stack_.push(x); +} + +NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept + : stack_(std::move(other.stack_)), + visited_(std::move(other.visited_)) {} + +NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other) + : stack_(other.stack_), visited_(other.visited_) {} + +Node &NodesDFSIterator::operator*() { + PADDLE_ENFORCE(!stack_.empty()); + return *stack_.top(); +} + +NodesDFSIterator &NodesDFSIterator::operator++() { + PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range"); + visited_.insert(stack_.top()); + auto *cur = stack_.top(); + stack_.pop(); + for (auto *x : cur->outputs) { + if (!visited_.count(x)) { + stack_.push(x); + } + } + return *this; +} +bool NodesDFSIterator::operator==(const NodesDFSIterator &other) { + if (stack_.empty()) return other.stack_.empty(); + if ((!stack_.empty()) && (!other.stack_.empty())) { + return stack_.top() == other.stack_.top(); + } + return false; +} + +NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) { + stack_ = other.stack_; + visited_ = other.visited_; + return *this; +} +Node *NodesDFSIterator::operator->() { return stack_.top(); } + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..f42bab20ed97e372d2da0c4a492a4458ab94e0a0 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -0,0 +1,94 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +class iterator_range { + IteratorT begin_, end_; + + public: + template + explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {} + + iterator_range(const IteratorT &begin, const IteratorT &end) + : begin_(begin), end_(end) {} + + const IteratorT &begin() const { return begin_; } + const IteratorT &end() const { return end_; } +}; + +// DFS iterator on nodes. 
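+// NodesDFSIterator is rarely used on its own; GraphTraits::DFS below wraps it
+// into an iterator_range, so a whole-graph traversal reads roughly as:
+//   for (Node &node : GraphTraits::DFS(graph)) {
+//     VLOG(3) << node.Name();
+//   }
+// starting from every node that has no inputs.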
+struct NodesDFSIterator + : public std::iterator { + NodesDFSIterator() = default; + explicit NodesDFSIterator(const std::vector &source); + NodesDFSIterator(NodesDFSIterator &&other) noexcept; + NodesDFSIterator(const NodesDFSIterator &other); + + Node &operator*(); + NodesDFSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesDFSIterator &operator=(const NodesDFSIterator &other); + bool operator==(const NodesDFSIterator &other); + bool operator!=(const NodesDFSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::stack stack_; + std::unordered_set visited_; +}; + +/* + * GraphTraits contains some graph traversal algorithms. + * + * Usage: + * + */ +struct GraphTraits { + static iterator_range DFS(const Graph &g) { + auto start_points = ExtractStartPoints(g); + NodesDFSIterator x(start_points); + return iterator_range(NodesDFSIterator(start_points), + NodesDFSIterator()); + } + + private: + // The nodes those have no input will be treated as start points. + static std::vector ExtractStartPoints(const Graph &g) { + std::vector result; + for (auto *node : g.Nodes()) { + if (node->inputs.empty()) { + result.push_back(node); + } + } + return result; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..31ed98db72c8fd4af8c970861d386687962001ce --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace framework { +namespace ir { +using inference::analysis::Dot; +namespace { +const char kGraphVizPath[] = "graph_viz_path"; + +std::string FormatName(const Node* node) { + if (!node->IsOp() || !node->Op() || + !node->Op()->HasAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())) { + return node->Name(); + } + const std::string full_scope = boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())); + return string::Sprintf("%s%s", full_scope.c_str(), node->Name().c_str()); +} +} // namespace + +std::unique_ptr GraphVizPass::ApplyImpl( + std::unique_ptr graph) const { + const std::string graph_viz_path = Get(kGraphVizPath); + VLOG(3) << "draw IR graph viz to " << graph_viz_path; + std::unique_ptr fout(new std::ofstream(graph_viz_path)); + PADDLE_ENFORCE(fout->good()); + std::ostream& sout = *fout; + + std::unordered_map node2dot; + + Dot dot; + + const std::vector op_attrs({ + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("shape", "box"), // + Dot::Attr("color", "#303A3A"), // + Dot::Attr("fontcolor", "#ffffff"), // + Dot::Attr("width", "1.3"), // + Dot::Attr("height", "0.84"), // + Dot::Attr("fontname", "Arial"), // + }); + const std::vector arg_attrs({ + Dot::Attr("shape", "box"), // + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("fontname", "Arial"), // + Dot::Attr("fillcolor", "#999999"), // + Dot::Attr("color", "#dddddd"), // + }); + + const std::vector param_attrs({ + Dot::Attr("shape", "box"), // + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("fontname", "Arial"), // + Dot::Attr("color", "#148b97"), // + Dot::Attr("fontcolor", "#ffffff"), // + }); + + const std::vector marked_op_attrs( + {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "yellow")}); + const std::vector marked_var_attrs( + {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "yellow")}); + + auto marked_nodes = ConsumeMarkedNodes(graph.get()); + // Create nodes + for (const Node* n : graph->Nodes()) { + std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; + if (n->IsOp()) { + decltype(op_attrs) attr = + marked_nodes.count(n) ? 
marked_op_attrs : op_attrs; + dot.AddNode(node_id, attr, node_id); + } else if (n->IsVar()) { + decltype(op_attrs)* attr; + if (marked_nodes.count(n)) { + attr = &marked_var_attrs; + } else if (const_cast(n)->Var() && + const_cast(n)->Var()->Persistable()) { + attr = ¶m_attrs; + } else { + attr = &arg_attrs; + } + + dot.AddNode(node_id, *attr, node_id); + } + node2dot[n] = node_id; + } + // Create edges + for (const Node* n : graph->Nodes()) { + const auto& src_id = node2dot.at(n); + for (auto* out : n->outputs) { + const auto& trg_id = node2dot.at(out); + dot.AddEdge(src_id, trg_id, {}); + } + } + + sout << dot.Build(); + + return graph; +} + +GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( + Graph* graph) const { + marked_nodes_t res; + if (graph->Has(kGraphvizMarkedNodeAttr)) { + auto& attr = graph->Get(kGraphvizMarkedNodeAttr); + res = attr; + attr.clear(); + } + return res; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..e64916a5bb662e3b00cfe212f0bbbc537c7bc2cc --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__"; + +class GraphVizPass : public Pass { + public: + using marked_nodes_t = std::unordered_set; + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + // Tell whether there are any marked nodes in the graph. Consume the + // corresponding attribute. + marked_nodes_t ConsumeMarkedNodes(Graph* graph) const; +}; + +static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) { + if (!graph->Has(kGraphvizMarkedNodeAttr)) { + graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t); + } + return graph->Get(kGraphvizMarkedNodeAttr); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..7713ed1eab88ee4fa16d52e7425075ae66f721a3 --- /dev/null +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferCleanGraphPass : public FusePassBase { + public: + virtual ~InferCleanGraphPass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + FusePassBase::Init("original_graph", graph.get()); + PADDLE_ENFORCE(graph.get()); + + auto is_valid_node = [](Node* x) { + return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); + }; + + std::unordered_set invalid_nodes; + int valid_op = 0; + for (auto* node : graph->Nodes()) { + if (is_valid_node(node)) { + invalid_nodes.insert(node); + } else if (node->IsOp()) { + // Collect all the operators to help tracking number of operators. + ++valid_op; + } + } + + GraphSafeRemoveNodes(graph.get(), invalid_nodes); + + AddStatis(valid_op); + + return graph; + } + + void CleanEdges(std::vector* nodes, + const std::unordered_set& to_remove) const { + auto it = std::remove_if(nodes->begin(), nodes->end(), + [&](Node* x) { return to_remove.count(x); }); + nodes->erase(it, nodes->end()); + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_clean_graph_pass, + paddle::framework::ir::InferCleanGraphPass); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc new file mode 100644 index 0000000000000000000000000000000000000000..2817fcf5320f00affdcba097681c7ab20f0eb227 --- /dev/null +++ b/paddle/fluid/framework/ir/node.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace framework { +namespace ir { +constexpr char Node::kControlDepVarName[]; +int Node::count_ = 0; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h new file mode 100644 index 0000000000000000000000000000000000000000..d53d789d3ad27b8f9606a396264d91e5f07a9d10 --- /dev/null +++ b/paddle/fluid/framework/ir/node.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node { + public: + enum class Type { kOperation, kVariable }; + static constexpr char kControlDepVarName[] = "__control_var"; + + explicit Node(const std::string& name, Type type) + : name_(name), + var_desc_(nullptr), + op_desc_(nullptr), + type_(type), + id_(count_++) {} + + explicit Node(VarDesc* var_desc) + : name_(var_desc->Name()), + var_desc_(new VarDesc(*var_desc)), + op_desc_(nullptr), + type_(Type::kVariable), + id_(count_++) {} + + explicit Node(OpDesc* op_desc) + : name_(op_desc->Type()), + var_desc_(nullptr), + op_desc_(new OpDesc(*op_desc, op_desc->Block())), + type_(Type::kOperation), + id_(count_++) {} + + Type NodeType() const { return type_; } + + std::string Name() const { return name_; } + + VarDesc* Var() { + PADDLE_ENFORCE(IsVar()); + return var_desc_.get(); + } + + OpDesc* Op() const { + PADDLE_ENFORCE(IsOp()); + return op_desc_.get(); + } + + int id() const { return id_; } + + bool IsOp() const { return type_ == Type::kOperation; } + bool IsVar() const { return type_ == Type::kVariable; } + + std::vector inputs; + std::vector outputs; + + protected: + const std::string name_; + std::unique_ptr var_desc_; + std::unique_ptr op_desc_; + Type type_; + int id_; + + private: + friend class Graph; + static int count_; + static void ResetId() { count_ = 0; } + DISABLE_COPY_AND_ASSIGN(Node); +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7158eba62686be57499df697466797e4034ea8f --- /dev/null +++ b/paddle/fluid/framework/ir/pass.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { +std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); + PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); + for (const std::string& attr : required_pass_attrs_) { + PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), + "Required pass atrribute %s not set.", attr); + } + for (const std::string& attr : required_graph_attrs_) { + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", + attr); + } + auto applied_graph = ApplyImpl(std::move(graph)); + // TODO(panyx0718): Add more verifications. + PADDLE_ENFORCE(!HasCircle(*applied_graph), + "Illegal Pass. Generated graph shouldn't has cycle."); + applied_ = true; + return applied_graph; +} + +PassRegistry& PassRegistry::Instance() { + static PassRegistry g_pass_info_map; + return g_pass_info_map; +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h new file mode 100644 index 0000000000000000000000000000000000000000..0f14083d259172f5b5f1ed80c7d38312d711beb5 --- /dev/null +++ b/paddle/fluid/framework/ir/pass.h @@ -0,0 +1,200 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { +namespace ir { +template +struct PassRegistrar; + +class Pass { + public: + Pass() = default; + virtual ~Pass() { + for (auto &attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } + + std::unique_ptr Apply(std::unique_ptr graph) const; + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), + "%s attr not registered for pass.", attr_name); + return *boost::any_cast(attrs_.at(attr_name)); + } + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. 
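+  // For example (sketch), a caller that keeps ownership of a ProgramDesc can
+  // hand it to a pass like this:
+  //   ProgramDesc program;
+  //   auto pass = PassRegistry::Instance().Get("graph_to_program_pass");
+  //   pass->SetNotOwned<ProgramDesc>("program", &program);
+  //   graph = pass->Apply(std::move(graph));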
+ template + void SetNotOwned(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + attrs_[attr_name] = attr; + } + + protected: + virtual std::unique_ptr ApplyImpl( + std::unique_ptr graph) const = 0; + + private: + template + friend struct PassRegistrar; + + void RegisterRequiredPassAttrs(const std::unordered_set &attrs) { + required_pass_attrs_.insert(attrs.begin(), attrs.end()); + } + + void RegisterRequiredGraphAttrs( + const std::unordered_set &attrs) { + required_graph_attrs_.insert(attrs.begin(), attrs.end()); + } + + mutable bool applied_{false}; + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; + std::map attrs_; + std::map> attr_dels_; +}; + +using PassCreator = std::function()>; + +class Registrar { + public: + // In our design, various kinds of passes, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_PASS macros to + // call this method. So, as long as the callee code calls USE_PASS, the global + // registrar variable won't be removed by the linker. + void Touch() {} +}; + +class PassRegistry { + public: + static PassRegistry &Instance(); + + bool Has(const std::string &pass_type) const { + return map_.find(pass_type) != map_.end(); + } + + void Insert(const std::string &pass_type, const PassCreator &pass_creator) { + PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type); + map_.insert({pass_type, pass_creator}); + } + + std::unique_ptr Get(const std::string &pass_type) const { + PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered", + pass_type); + return map_.at(pass_type)(); + } + + private: + PassRegistry() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(PassRegistry); +}; + +template +struct PassRegistrar : public Registrar { + explicit PassRegistrar(const char *pass_type) { + PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), + "'%s' is registered more than once.", pass_type); + PassRegistry::Instance().Insert( + pass_type, [this]() -> std::unique_ptr { + std::unique_ptr pass(new PassType()); + pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); + pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + return pass; + }); + } + + PassRegistrar &RequirePassAttr(const std::string &attr) { + required_pass_attrs_.insert(attr); + return *this; + } + + PassRegistrar &RequireGraphAttr(const std::string &attr) { + required_graph_attrs_.insert(attr); + return *this; + } + + private: + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; +}; + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Register a new pass that can be applied on the IR. 
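+// Typical usage (sketch): a pass implementation registers itself with
+//   REGISTER_PASS(graph_to_program_pass,
+//                 paddle::framework::ir::GraphToProgramPass);
+// and a translation unit that needs that registration kept alive by the linker
+// pulls it in with
+//   USE_PASS(graph_to_program_pass);
+// after which PassRegistry::Instance().Get("graph_to_program_pass") returns a
+// fresh instance of the pass.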
+#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ + __pass_registrar_##pass_type##__ + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ + TouchPassRegistrar_##pass_type() + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b5011412ed39e033a7a65921e9c64ce2d54c638 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+class TestPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    graph->Set("copy_test_pass_attr", new int);
+    graph->Set("copy_test_graph_attr", new int);
+
+    int test_pass_attr = this->Get<int>("test_pass_attr");
+    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
+
+    int test_graph_attr = graph->Get<int>("test_graph_attr");
+    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
+    return graph;
+  }
+};
+
+TEST(PassTest, TestPassAttrCheck) {
+  ProgramDesc prog;
+  auto pass = PassRegistry::Instance().Get("test_pass");
+  std::unique_ptr<Graph> graph(new Graph(prog));
+  std::string exception;
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
+
+  int val = 1;
+  graph.reset(new Graph(prog));
+  pass->SetNotOwned("test_pass_attr", &val);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
+
+  graph.reset(new Graph(prog));
+  graph->Set("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 1;
+  graph = pass->Apply(std::move(graph));
+  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
+  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+
+  pass = PassRegistry::Instance().Get("test_pass");
+  pass->SetNotOwned("test_pass_attr", &val);
+  graph.reset(new Graph(prog));
+  BuildCircleGraph(graph.get());
+  graph->Set("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 2;
+  try {
+    auto tmp = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
+    .RequirePassAttr("test_pass_attr")
+    .RequireGraphAttr("test_graph_attr");
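A minimal sketch of defining, registering, and running a pass with the API above; the pass name my_fuse_pass, the class MyFusePass, and the attribute example_pass_attr are hypothetical and this file is not part of the patch.

// my_fuse_pass.cc (hypothetical) -- illustrates the pass API sketched above.
#include <memory>
#include <utility>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

// A do-nothing pass that only reads one required pass attribute.
class MyFusePass : public Pass {
 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
    int strength = Get<int>("example_pass_attr");  // guaranteed by RequirePassAttr
    (void)strength;  // a real pass would rewrite `graph` here
    return graph;
  }
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle

// Registration must happen in the global namespace.
REGISTER_PASS(my_fuse_pass, paddle::framework::ir::MyFusePass)
    .RequirePassAttr("example_pass_attr");

// A caller, after USE_PASS(my_fuse_pass) in some translation unit, would run:
//   auto pass = paddle::framework::ir::PassRegistry::Instance().Get("my_fuse_pass");
//   int attr = 1;
//   pass->SetNotOwned("example_pass_attr", &attr);
//   graph = pass->Apply(std::move(graph));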
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a7d5161c35db804703415066990f34da8109fbd9
--- /dev/null
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -0,0 +1,258 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+struct FuseExpr {};
+
+// sequence expand, concat fuse pattern, return concat's output
+PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) {
+  // The following operators will be fused:
+  //   concat
+  //   sequence_expand
+  //   sequence_expand
+
+  // The following variables will be treated as inputs:
+  //   concat mid input, 0th input for the fused op
+  //   sequence_expand input, 1st input for the fused op
+  //   sequence_expand input, 2nd input for the fused op
+
+  // The following variables will be treated as outputs:
+  //   concat output
+
+  // So the following variables will be removed:
+  //   sequence_expand output
+  //   sequence_expand output
+
+  // Three operators
+  auto* sequence_expand0 = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsOp() && x->Op()->Type() == "sequence_expand";
+      },
+      "sequence_expand0");
+
+  auto* sequence_expand1 = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsOp() && x->Op()->Type() == "sequence_expand";
+      },
+      "sequence_expand1");
+
+  auto* concat = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsOp() && x->Op()->Type() == "concat" &&  // basic check
+               x->Op()->Input("X").size() == 3;                  // Special case
+      },
+      "concat");
+
+  auto* sequence_expand0_in = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsVar() && VarLinksToOp(x, "sequence_expand");
+      },
+      "sequence_expand0_in");
+  auto* sequence_expand1_in = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsVar() && VarLinksToOp(x, "sequence_expand");
+      },
+      "sequence_expand1_in");
+
+  // The variables
+  auto* sequence_expand0_out = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsVar() &&
+               VarLinksFromOp(x, "sequence_expand") &&  // basic check
+               VarLinksToOp(x, "concat") &&             // is concat's input
+               IsNthInput(x, x->outputs[0], "X", 1);    // X[1]
+      },
+      "sequence_expand0_out");
+
+  auto* sequence_expand1_out = pattern->NewNode(
+      [](Node* x) {
+        return x && x->IsVar() &&
+               VarLinksFromOp(x, "sequence_expand") &&  // basic check
+               VarLinksToOp(x, "concat") &&             // is concat's input
+               IsNthInput(x, x->outputs[0], "X", 2);    // X[2]
+      },
+      "sequence_expand1_out");
+
+  auto* concat_in0 = pattern->NewNode(
+      [](Node* x) { return x && x->IsVar() && VarLinksToOp(x, "concat"); },
+      "concat_in0");
+
+  auto* concat_out = pattern->NewNode(
+      [](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "concat"); },
+      "concat_out");
+
+  // Links
+  sequence_expand0->LinksFrom({sequence_expand0_in})
+      .LinksTo({sequence_expand0_out});
+  sequence_expand1->LinksFrom({sequence_expand1_in})
+      .LinksTo({sequence_expand1_out});
+  concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0})
+      .LinksTo({concat_out});
+  return
concat_out; +} + +PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { + PDNode* fc_w = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "mul") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_w"); + + PDNode* mul_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "mul") && // link + VarLinksToOp(x, "elementwise_add") && // + !x->Var()->Proto()->persistable(); // is a parameter + }, + "mul_out"); + + PDNode* fc_mul = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "mul"; // basic + }, + "fc_mul"); + + PDNode* fc_bias = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "elementwise_add") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_bias"); + + PDNode* elementwise_add = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "elementwise_add"; + }, + "elementwise_add"); + + PDNode* add_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "elementwise_add") && // link + !x->Var()->Proto()->persistable(); // is a parameter + }, + "add_out"); + + std::set acts({"sigmoid", "tanh", "relu", "identity"}); + PDNode* act = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && acts.count(x->Op()->Type()); + + }, + "act"); + + PDNode* fc_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + !x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_out"); + + fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out}); + elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out}); + act->LinksFrom({add_out}).LinksTo({fc_out}); + return fc_out; +} + +std::unique_ptr SeqConcatFcFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init("seq_concat_fc_fuse", graph.get()); + GraphPatternDetector detector; + auto* pattern = detector.mutable_pattern(); + auto* concat_out = BuildSeqExpandConcatPattern(pattern); + BuildFCPattern(pattern, concat_out); + +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + + int fuse_count{0}; + + detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "get one concat pattern"; + // fc + GET_NODE(fc_w, detector.pattern()); + GET_NODE(fc_bias, detector.pattern()); + GET_NODE(act, detector.pattern()); + GET_NODE(fc_out, detector.pattern()); + + // concat + GET_NODE(concat_in0, detector.pattern()); + GET_NODE(sequence_expand0_in, detector.pattern()); + GET_NODE(sequence_expand1_in, detector.pattern()); + + OpDesc op_desc; + op_desc.SetType("fusion_seqexpand_concat_fc"); + op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(), + sequence_expand1_in->Name()}); + op_desc.SetInput("FCWeight", {fc_w->Name()}); + op_desc.SetInput("FCBias", {fc_bias->Name()}); + const std::string fc_out_tmp = fc_out->Name() + ".tmp"; + param_scope()->Var(fc_out_tmp)->GetMutable(); + op_desc.SetOutput("FCOut", {fc_out_tmp}); + op_desc.SetOutput("Out", {fc_out->Name()}); + op_desc.SetAttr("fc_activation", act->Op()->Type()); + + auto* op_node = graph->CreateOpNode(&op_desc); + // Add links + IR_NODE_LINK_TO(fc_w, op_node); + IR_NODE_LINK_TO(fc_bias, op_node); + IR_NODE_LINK_TO(concat_in0, op_node); + 
IR_NODE_LINK_TO(sequence_expand0_in, op_node); + IR_NODE_LINK_TO(sequence_expand1_in, op_node); + IR_NODE_LINK_TO(op_node, fc_out); + + // Clean nodes. + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + marked_nodes.erase(fc_w); + marked_nodes.erase(fc_bias); + marked_nodes.erase(concat_in0); + marked_nodes.erase(sequence_expand0_in); + marked_nodes.erase(sequence_expand1_in); + marked_nodes.erase(fc_out); + GraphSafeRemoveNodes(graph, marked_nodes); + + ++fuse_count; + }); + + AddStatis(fuse_count); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seq_concat_fc_fuse_pass, + paddle::framework::ir::SeqConcatFcFusePass); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..9f5fd1a29adf918806d8f30097d8c7f002f48f3e --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqConcatFcFusePass : public FusePassBase { + public: + virtual ~SeqConcatFcFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f..1e7da9a69c7cbf8c13306656599a759515802b76 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -21,12 +21,15 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" +#if !defined(_WIN32) #include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/writer.h" +#endif // _WIN32 namespace paddle { namespace framework { @@ -249,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext &dev_ctx) { { // the 1st field, uint32_t version for LoDTensor - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); + os.write(reinterpret_cast(&kCurTensorVersion), + sizeof(kCurTensorVersion)); } { // the 2st field, LoD information @@ -279,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, // the 1st field, unit32_t version for LoDTensor uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE(framework::IsTensorVersionSupported(version), + "tensor version %u is not supported.", version); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); } { @@ -300,6 +305,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } +#if !defined(_WIN32) void WriteToRecordIO(recordio::Writer *writer, const std::vector &tensor, const platform::DeviceContext &dev_ctx) { @@ -312,21 +318,36 @@ void WriteToRecordIO(recordio::Writer *writer, writer->Write(buffer.str()); } -std::vector ReadFromRecordIO( - recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) { - std::vector result; - if (scanner->HasNext()) { - std::istringstream sin(scanner->Next()); - uint32_t sz; - sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); - result.resize(sz); - for (uint32_t i = 0; i < sz; ++i) { - DeserializeFromStream(sin, &result[i], dev_ctx); - } +bool ReadFromRecordIO(recordio::Scanner *scanner, + const platform::DeviceContext &dev_ctx, + std::vector *result_ptr) { + if (!scanner->HasNext()) { + return false; + } + std::istringstream sin(scanner->Next()); + uint32_t sz; + sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); + auto &result = *result_ptr; + result.resize(sz); + for (uint32_t i = 0; i < sz; ++i) { + DeserializeFromStream(sin, &result[i], dev_ctx); } - return result; -} + return true; +} +#else +class Writer {}; +class Scanner {}; +void WriteToRecordIO(recordio::Writer *writer, + const std::vector &tensor, + const platform::DeviceContext &dev_ctx) {} +bool ReadFromRecordIO(recordio::Scanner *scanner, + const platform::DeviceContext &dev_ctx, + std::vector *result_ptr) { + PADDLE_ENFORCE("windows didn't supported recordio!."); + return true; +} +#endif // _WIN32 std::vector LoDTensor::SplitLoDTensor( const std::vector places) const { check_memory_size(); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 4a2729373b5c63176ed1e856f4acf29fd1e73254..e9b473d547252e80ed26ec61e1a33fbe1742dbe0 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer, const std::vector& tensor, const platform::DeviceContext& dev_ctx); -extern std::vector ReadFromRecordIO( - recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx); +extern bool ReadFromRecordIO(recordio::Scanner* scanner, + const 
platform::DeviceContext& dev_ctx, + std::vector* result_ptr); /* * Convert between length-based LoD and offset-based LoD. diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 38d3cd96d65f0a54b0ea87b4c677013f3802adfb..cbf5fd04d73007d303d0fd96064e3a2d7f21cfb8 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -274,6 +274,7 @@ TEST(LoD, ConvertToOffsetBasedLoD) { EXPECT_EQ(offset_lod, expected); } +#if !defined(_WIN32) template static void TestRecordIO() { LoDTensor tensor; @@ -301,11 +302,12 @@ static void TestRecordIO() { { std::unique_ptr stream_ptr(stream); recordio::Scanner scanner(std::move(stream_ptr)); - auto tensors = ReadFromRecordIO(&scanner, ctx); + std::vector tensors; + ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); - tensors = ReadFromRecordIO(&scanner, ctx); + ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); @@ -319,6 +321,7 @@ TEST(LoDTensor, RecordIO) { TestRecordIO(); TestRecordIO(); } +#endif // !defined(_WIN32) } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 29b3396bc9854cd3d3ac8d4283f48019c9a9c55f..7836ecb1272a07a79a70c9cb040335f9a42e5684 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/tensor.h" @@ -26,6 +27,7 @@ namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template @@ -37,11 +39,11 @@ class Vector { Vector() { InitEmpty(); } // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T& value = T()) { + explicit Vector(size_t count, const T &value = T()) { InitEmpty(); if (count != 0) { resize(count); - T* ptr = begin(); + T *ptr = begin(); for (size_t i = 0; i < count; ++i) { ptr[i] = value; } @@ -59,7 +61,7 @@ class Vector { // implicit cast from std::vector. template - Vector(const std::vector& dat) { // NOLINT + Vector(const std::vector &dat) { // NOLINT if (dat.size() == 0) { InitEmpty(); } else { @@ -68,10 +70,10 @@ class Vector { } // Copy ctor - Vector(const Vector& other) { this->operator=(other); } + Vector(const Vector &other) { this->operator=(other); } // Copy operator - Vector& operator=(const Vector& other) { + Vector &operator=(const Vector &other) { if (other.size() != 0) { this->InitByIter(other.size(), other.begin(), other.end()); } else { @@ -81,7 +83,7 @@ class Vector { } // Move ctor - Vector(Vector&& other) { + Vector(Vector &&other) { this->size_ = other.size_; this->flag_ = other.flag_; if (other.cuda_vec_.memory_size()) { @@ -93,13 +95,13 @@ class Vector { } // CPU data access method. Mutable. - T& operator[](size_t i) { + T &operator[](size_t i) { MutableCPU(); - return const_cast(cpu_vec_.data())[i]; + return const_cast(cpu_vec_.data())[i]; } // CPU data access method. Immutable. - const T& operator[](size_t i) const { + const T &operator[](size_t i) const { ImmutableCPU(); return cpu_vec_.data()[i]; } @@ -107,43 +109,43 @@ class Vector { // std::vector iterator methods. 
Based on CPU data access method size_t size() const { return size_; } - T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } + T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } - T* end() { + T *end() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); } - T& front() { return *begin(); } + T &front() { return *begin(); } - T& back() { + T &back() { auto it = end(); --it; return *it; } - const T* begin() const { + const T *begin() const { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } - const T* end() const { + const T *end() const { return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); } - const T* cbegin() const { return begin(); } + const T *cbegin() const { return begin(); } - const T* cend() const { return end(); } + const T *cend() const { return end(); } - const T& back() const { + const T &back() const { auto it = end(); --it; return *it; } - T* data() { return begin(); } + T *data() { return begin(); } - const T* data() const { return begin(); } + const T *data() const { return begin(); } - const T& front() const { return *begin(); } + const T &front() const { return *begin(); } // end of std::vector iterator methods // assign this from iterator. @@ -169,7 +171,7 @@ class Vector { void Extend(It begin, It end) { size_t pre_size = size_; resize(pre_size + (end - begin)); - T* ptr = this->begin() + pre_size; + T *ptr = this->begin() + pre_size; for (; begin < end; ++begin, ++ptr) { *ptr = *begin; } @@ -183,9 +185,9 @@ class Vector { MutableCPU(); Tensor cpu_tensor; platform::Place cpu = platform::CPUPlace(); - T* ptr = cpu_tensor.mutable_data( + T *ptr = cpu_tensor.mutable_data( framework::make_ddim({static_cast(size)}), cpu); - const T* old_ptr = + const T *old_ptr = cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); if (old_ptr != nullptr) { std::copy(old_ptr, old_ptr + size_, ptr); @@ -196,7 +198,7 @@ class Vector { } // get cuda ptr. immutable - const T* CUDAData(platform::Place place) const { + const T *CUDAData(platform::Place place) const { PADDLE_ENFORCE(platform::is_gpu_place(place), "CUDA Data must on CUDA place"); ImmutableCUDA(place); @@ -204,10 +206,10 @@ class Vector { } // get cuda ptr. mutable - T* CUDAMutableData(platform::Place place) { - const T* ptr = CUDAData(place); + T *CUDAMutableData(platform::Place place) { + const T *ptr = CUDAData(place); flag_ = kDirty | kDataInCUDA; - return const_cast(ptr); + return const_cast(ptr); } // clear @@ -228,7 +230,7 @@ class Vector { } // the unify method to access CPU or CUDA data. immutable. - const T* Data(platform::Place place) const { + const T *Data(platform::Place place) const { if (platform::is_gpu_place(place)) { return CUDAData(place); } else { @@ -237,7 +239,7 @@ class Vector { } // the unify method to access CPU or CUDA data. mutable. 
- T* MutableData(platform::Place place) { + T *MutableData(platform::Place place) { if (platform::is_gpu_place(place)) { return CUDAMutableData(place); } else { @@ -253,7 +255,7 @@ class Vector { return result; } - bool operator==(const Vector& other) const { + bool operator==(const Vector &other) const { if (size() != other.size()) return false; auto it1 = cbegin(); auto it2 = other.cbegin(); @@ -274,7 +276,7 @@ class Vector { template void InitByIter(size_t size, Iter begin, Iter end) { platform::Place cpu = platform::CPUPlace(); - T* ptr = this->cpu_vec_.template mutable_data( + T *ptr = this->cpu_vec_.template mutable_data( framework::make_ddim({static_cast(size)}), cpu); for (size_t i = 0; i < size; ++i) { *ptr++ = *begin++; @@ -368,7 +370,7 @@ class Vector { } } - static T& EmptyDummy() { + static T &EmptyDummy() { static T dummy = T(); return dummy; } @@ -379,5 +381,52 @@ class Vector { size_t size_; }; -} // namespace framework +#else // PADDLE_WITH_CUDA + +template +class CPUVector : public std::vector> { + public: + CPUVector() : std::vector() {} + CPUVector(size_t count, const T &value = T()) // NOLINT + : std::vector(count, value) {} + CPUVector(std::initializer_list init) : std::vector(init) {} + CPUVector(const std::vector &other) : std::vector(other) {} // NOLINT + CPUVector(const CPUVector &other) : std::vector(other) {} + CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} + CPUVector(std::vector &&other) // NOLINT + : std::vector(std::move(other)) {} + CPUVector &operator=(const CPUVector &other) { + this->assign(other.begin(), other.end()); + return *this; + } + CPUVector &operator=(const std::vector &other) { + this->assign(other.begin(), other.end()); + return *this; + } + + friend std::ostream &operator<<(std::ostream &os, const CPUVector &other) { + std::stringstream ss; + for (auto v : other) { + os << v << " "; + } + return os; + } + + T &operator[](size_t id) { return this->at(id); } + + const T &operator[](size_t id) const { return this->at(id); } + + template + void Extend(const D &begin, const D &end) { + this->reserve(this->size() + size_t(end - begin)); + this->insert(this->end(), begin, end); + } +}; + +template +using Vector = CPUVector; + +#endif // PADDLE_WITH_CUDA + +}; // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0599c8d384641606b0a5ebb5ba1781b56f539e63 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" + +template +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10UL); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; + } +} + +TEST(mixed_vector, InitWithCount) { + paddle::framework::Vector vec(10, 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(vec[i], 10); + } +} + +TEST(mixed_vector, ForEach) { + vec tmp; + for (auto& v : tmp) { + VLOG(3) << v; + } +} + +TEST(mixed_vector, Reserve) { + paddle::framework::Vector vec; + vec.reserve(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +TEST(mixed_vector, Resize) { + paddle::framework::Vector vec; + vec.resize(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index d57f82510833d6a0cea7009cf1f0b49543812f8d..4b0caa8d350dde0462e5fdcca743df919358a364 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -11,7 +11,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include +#include #include "glog/logging.h" #include "gtest/gtest.h" @@ -21,26 +23,6 @@ template using vec = paddle::framework::Vector; -TEST(mixed_vector, CPU_VECTOR) { - vec tmp; - for (int i = 0; i < 10; ++i) { - tmp.push_back(i); - } - ASSERT_EQ(tmp.size(), 10UL); - vec tmp2; - tmp2 = tmp; - ASSERT_EQ(tmp2.size(), 10UL); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(tmp2[i], i); - ASSERT_EQ(tmp2[i], tmp[i]); - } - int cnt = 0; - for (auto& t : tmp2) { - ASSERT_EQ(t, cnt); - ++cnt; - } -} - static __global__ void multiply_10(int* ptr) { for (int i = 0; i < 10; ++i) { ptr[i] *= 10; @@ -91,24 +73,3 @@ TEST(mixed_vector, MultiGPU) { ASSERT_EQ(tmp[i], i * 100); } } - -TEST(mixed_vector, InitWithCount) { - paddle::framework::Vector vec(10, 10); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(vec[i], 10); - } -} - -TEST(mixed_vector, ForEach) { - vec tmp; - for (auto& v : tmp) { - } -} - -TEST(mixed_vector, Reserve) { - paddle::framework::Vector vec; - vec.reserve(1); - vec.push_back(0); - vec.push_back(0); - vec.push_back(0); -} diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index a190199f1cb1361f67f20c755b8e7ef52c284adc..555faba9624b9c76a9efdf4a62cd319f9682566e 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } +OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { + CopyFrom(other); + block_ = block; + need_update_ = true; +} + void OpDesc::CopyFrom(const OpDesc &op_desc) { desc_.set_type(op_desc.Type()); inputs_ = op_desc.inputs_; @@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); // The sub_block referred to by the BLOCK attr hasn't been added - // to ProgramDesc class yet, we skip setting BLOCK attr here. 
- if (attr.type() != proto::AttrType::BLOCK) { + // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here. + if (attr.type() != proto::AttrType::BLOCK && + attr.type() != proto::AttrType::BLOCKS) { attrs_[attr_name] = GetAttrValue(attr); } } @@ -202,6 +209,52 @@ std::vector OpDesc::AttrNames() const { } void OpDesc::SetAttr(const std::string &name, const Attribute &v) { + // NOTICE(minqiyang): pybind11 will take the empty list in python as + // the std::vector type in C++; so we have to change the attr's type + // here if we meet this issue + proto::AttrType attr_type = static_cast(v.which() - 1); + if (attr_type == proto::AttrType::INTS && + boost::get>(v).size() == 0u) { + // Find current attr via attr name and set the correct attribute value + const proto::OpProto::Attr &attr = GetProtoAttr(name); + switch (attr.type()) { + case proto::AttrType::BOOLEANS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to BOOLEANS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::INTS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to INTS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::FLOATS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to FLOATS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::STRINGS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to STRINGS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::BLOCKS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to BLOCKS"; + this->SetBlocksAttr(name, std::vector()); + return; + } + default: + PADDLE_THROW("Wrong attr type %d", attr.type()); + } + need_update_ = true; + return; + } + this->attrs_[name] = v; need_update_ = true; } @@ -229,6 +282,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const { return it->second; } +const proto::OpProto::Attr &OpDesc::GetProtoAttr( + const std::string &name) const { + const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return attr; + } + } + + PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type()); +} + Attribute OpDesc::GetNullableAttr(const std::string &name) const { auto it = attrs_.find(name); if (it != attrs_.end()) { @@ -238,7 +304,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const { } } -int OpDesc::GetBlockAttr(const std::string &name) const { +std::vector OpDesc::GetBlocksAttrIds(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + auto blocks = boost::get>(it->second); + + std::vector ids; + for (auto n : blocks) { + ids.push_back(n->ID()); + } + + return ids; +} + +int OpDesc::GetBlockAttrId(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return boost::get(it->second)->ID(); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 74dd8ec002005dd080424b48b5db1a2574a6974f..b4205aba83e774fb9c08193124adb93935c00157 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -37,11 +37,7 @@ class OpDesc { explicit OpDesc(BlockDesc *block) : block_(block) {} - OpDesc(const OpDesc &other, BlockDesc *block) { - *this = other; - block_ = block; - 
need_update_ = true; - } + OpDesc(const OpDesc &other, BlockDesc *block); void CopyFrom(const OpDesc &op_desc); @@ -81,9 +77,13 @@ class OpDesc { Attribute GetAttr(const std::string &name) const; + const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const; + Attribute GetNullableAttr(const std::string &name) const; - int GetBlockAttr(const std::string &name) const; + int GetBlockAttrId(const std::string &name) const; + + std::vector GetBlocksAttrIds(const std::string &name) const; void Rename(const std::string &old_name, const std::string &new_name); diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index f1261dee0319440995951d1bee145404186a8ad4..af75baa5c4b98f7d092834c05eb57e9c7e131b29 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -21,8 +21,8 @@ namespace framework { // a static local variable is already being initialized. // https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex OpInfoMap& OpInfoMap::Instance() { - static OpInfoMap* g_op_info_map = new OpInfoMap(); - return *g_op_info_map; + static OpInfoMap g_op_info_map; + return g_op_info_map; } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index db95861c510b52a5b52229541434e6437d3fb9f4..3e17a512ce154de88ac890f3b29f03385595d95c 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) { ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" "CUDNN]"); + + using CUDAPlace = paddle::platform::CUDAPlace; + OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), + "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "type[CUDNN]"); } TEST(OpKernelType, Hash) { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922..4fa047bf3ee3d06ac4aec5d2cc6a355965836d42 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( return OpProtoAndCheckerMaker::VariableBuilder{output}; } +void OpProtoAndCheckerMaker::Reuse(const std::string& name, + const std::string& reused_name) { + bool found = false; + proto::OpProto::Var* var; + + for (auto& var : proto_->inputs()) { + if (var.name() == reused_name) { + found = true; + break; + } + } + PADDLE_ENFORCE(found == true, + "Input/Output name: %s reused_name: %s, one of them is not " + "exists or not matched.", + name, reused_name); + + found = false; + for (int i = 0; i < proto_->outputs().size(); ++i) { + var = proto_->mutable_outputs()->Mutable(i); + if (var->name() == name) { + PADDLE_ENFORCE(!var->has_reuse(), + "Output(%s) has been set reused var of %s", name, + var->reuse()); + found = true; + var->set_reuse(reused_name); + break; + } + } + PADDLE_ENFORCE(found == true, + "Input/Output name: %s reused_name: %s, one of them is not " + "exists or not matched.", + name, reused_name); +} + void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { std::unordered_set names; auto checker = [&](const std::string& name) { @@ 
-95,6 +129,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, "Optimized for variable") .SetDefault({}); + AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") + .SetDefault(""); + Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 92f86bb5de520878d0a7b8d7214620580242c061..18827385ad659922230ff68709a2926a8c9013ac 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -39,6 +39,7 @@ class OpProtoAndCheckerMaker { public: static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } + static const char *OpNamescopeAttrName() { return "op_namescope"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); @@ -78,6 +79,8 @@ class OpProtoAndCheckerMaker { VariableBuilder AddOutput(const std::string &name, const std::string &comment); + void Reuse(const std::string &name, const std::string &reused_name); + template TypedAttrChecker &AddAttr(const std::string &name, const std::string &comment, diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index 58f70cb39c0d96ed3b9ff35ea132ba75a37f5405..b71c7b646857e11f291748c4c7c2af92b6d53231 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) { } class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + } +}; + +class TestInplaceProtoMaker2 + : public paddle::framework::OpProtoAndCheckerMaker { public: void Make() { AddInput("X", "input of test op"); @@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, InplaceOutput) { - paddle::framework::proto::OpProto op_proto; + paddle::framework::proto::OpProto op_proto, op_proto2; paddle::framework::OpAttrChecker op_checker; TestInplaceProtoMaker proto_maker; - ASSERT_THROW(proto_maker(&op_proto, &op_checker), + TestInplaceProtoMaker2 proto_maker2; + + proto_maker(&op_proto, &op_checker); + + ASSERT_THROW(proto_maker2(&op_proto2, &op_checker), paddle::platform::EnforceNotMet); - // proto_maker(&op_proto, &op_checker); - // proto_maker.Make(); - // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); } + +// normal reuse +class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddInput("Y", "input of test op"); + AddOutput("Out", "output of test op"); + AddOutput("XOut", "output of test op"); + // avoid destructor exception. 
+ // Validate(); + TestReuse(); + } + + virtual void TestReuse() {} +}; + +// test duplicate reuse error +class TestReuseProtoMaker2 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "X"); + Reuse("Out", "Y"); + } +}; + +// NotExists Input +class TestReuseProtoMaker3 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "NotExists"); + Reuse("XOut", "X"); + } +}; + +// NotExists Output +class TestReuseProtoMaker4 : public TestReuseProtoMaker { + public: + void TestReuse() { Reuse("NotExists", "X"); } +}; + +TEST(ProtoMaker, Reuse) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker proto_maker; + proto_maker(&op_proto, &op_checker); +} + +// NOTE(dzhwinter): +// There is a Fatal CHECK on base class destructor, which will call abort inside +// instead of +// throw an exception. If we throw an exception in Make(), we will trigger the +// CHECK and terminate the tests. +// +// I had tried to replace the default CHECK with a exception, however, it's +// still not supported by glog. +// the details: +// https://github.com/google/glog/issues/249 +// https://github.com/facebookresearch/TensorComprehensions/issues/351 +/* +TEST(ProtoMaker, ReuseWithException) { + paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker2 proto_maker2; + TestReuseProtoMaker3 proto_maker3; + TestReuseProtoMaker4 proto_maker4; + EXPECT_THROW(proto_maker2(&op_proto2, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker3(&op_proto3, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker4(&op_proto4, &op_checker), + paddle::platform::EnforceNotMet); +} + +void FailureFunction() { + throw std::runtime_error("Check failed in destructor."); + // return 0; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::InstallFailureFunction(&FailureFunction); + return RUN_ALL_TESTS(); +} +*/ diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 3cf8e8696d739e3f2894e490161b9fb5b459bc41..b7fae7171a57666a8fb4613a7cbe3aa15997b638 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" @@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, } if (var->IsType()) { - return var->Get().dims(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } + return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { return var->Get().value().dims(); @@ -69,6 +74,36 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static bool VarInited(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) return false; + return var->IsInitialized(); +} + +static std::string GetDtype(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return ""; + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return ""; + } + return DataTypeToString(ToDataType(tensor.type())); + } else if (var->IsType()) { + auto tensor = var->Get().value(); + if (UNLIKELY(!tensor.IsInitialized())) { + return "uninited"; + } else { + return DataTypeToString(ToDataType(tensor.type())); + } + } else { + return ""; + } +} + static int GetRowSize(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -91,14 +126,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } if (var->IsType()) { - return var->Get().lod(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } + return tensor.lod(); } else { return default_lod; } } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(10) << "- " << DebugStringEx(&scope); + VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -107,8 +146,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - VLOG(10) << "+ " << DebugStringEx(&scope); + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -166,14 +207,21 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { auto& input = *it; ss << input.first << "["; for (size_t i = 0; i < input.second.size(); ++i) { - ss << input.second[i]; + auto var_name = input.second[i]; + ss << var_name; if (scope) { - int row_size = GetRowSize(*scope, input.second[i]); - if (row_size >= 0) { - ss << "[row_size=" << row_size << "]"; + if (!VarInited(*scope, var_name)) { + ss << "[uninited]"; + } else { + int row_size = GetRowSize(*scope, var_name); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } + std::string dtype = GetDtype(*scope, var_name); + ss << ":" << dtype; + ss << "[" << GetDims(*scope, var_name, true) << "]"; + ss << "(" << GetLoD(*scope, var_name) << ")"; } - ss << "[" << GetDims(*scope, input.second[i], true) << "]"; - ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } if (i != 
input.second.size() - 1) { ss << ", "; @@ -190,14 +238,19 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { auto& output = *it; ss << output.first << "["; for (size_t i = 0; i < output.second.size(); ++i) { - ss << output.second[i]; + auto var_name = output.second[i]; + ss << var_name; if (scope) { - int row_size = GetRowSize(*scope, output.second[i]); - if (row_size >= 0) { - ss << "[row_size=" << row_size << "]"; + if (!VarInited(*scope, var_name)) { + ss << "[uninited]"; + } else { + int row_size = GetRowSize(*scope, output.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } + ss << "[" << GetDims(*scope, var_name, true) << "]"; + ss << "(" << GetLoD(*scope, var_name) << ")"; } - ss << "[" << GetDims(*scope, output.second[i], true) << "]"; - ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } if (i != output.second.size() - 1) { ss << ", "; @@ -411,35 +464,35 @@ class RuntimeInferShapeContext : public InferShapeContext { : op_(op), scope_(scope) {} bool HasInput(const std::string& name) const override { - if (!op_.HasInputs(name)) { + // has only one input + const auto& ins = op_.Inputs(); + auto it = ins.find(name); + if (it == ins.end()) { return false; } - auto& ins = Inputs(name); - size_t length = ins.size(); - if (length == 0) { + const auto& in = it->second; + if (in.size() == 0 || in[0] == kEmptyVarName) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, + PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - auto ipt = ins[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; + return scope_.FindVar(in[0]) != nullptr; } bool HasOutput(const std::string& name) const override { - if (!op_.HasOutputs(name)) { + // has only one output + const auto& outs = op_.Outputs(); + auto it = outs.find(name); + if (it == outs.end()) { return false; } - auto& outs = Outputs(name); - size_t length = outs.size(); - if (length == 0) { + const auto& out = it->second; + if (out.size() == 0 || out[0] == kEmptyVarName) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output %s should not have more than one inputs", name); - auto ipt = outs[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; + PADDLE_ENFORCE_EQ(out.size(), 1UL, + "Output %s should not have more than one outputs", name); + return scope_.FindVar(out[0]) != nullptr; } bool HasInputs(const std::string& name) const override { @@ -608,9 +661,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - // For profiling, don't move out of this function because that will result - // in the failure of multi-GPU profiling. - platform::RecordEvent record_event(Type(), dev_ctx); // check if op[type] has kernel registered. 
auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); @@ -633,6 +683,16 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif if (kernel_iter == kernels.end()) { PADDLE_THROW("op %s does not have kernel for %s", type_, KernelTypeToString(expected_kernel_key)); @@ -669,6 +729,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get().value()); } } } @@ -736,6 +798,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { auto& scope = ctx.scope(); int data_type = -1; + std::string last_input_name; for (auto& input : this->inputs_) { for (auto& ipt_name : input.second) { auto* var = scope.FindVar(ipt_name); @@ -752,9 +815,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %d != %d", Type(), - data_type, tmp); + "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", + Type(), last_input_name, data_type, ipt_name, tmp); data_type = tmp; + last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5a19e7f1bf94abaac6d13e963cab3779c0789b82..880521f29e0c69af047f8ce4f6575694bd387ca7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -18,24 +18,83 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/details/reference_count_pass.h" #include "paddle/fluid/platform/nccl_helper.h" #endif -#include "paddle/fluid/framework/details/all_reduce_op_handle.h" -#include "paddle/fluid/framework/details/broadcast_op_handle.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/reduce_op_handle.h" -#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { +std::unique_ptr ApplyParallelExecutorPass( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, const bool use_cuda, +#ifdef PADDLE_WITH_CUDA + const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { +#else + const BuildStrategy &strategy) { +#endif + // Convert the program to graph. + std::unique_ptr graph(new ir::Graph(main_program)); + + // Apply a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); + } + + // Convert graph to run on multi-devices. + auto multi_devices_pass = + ir::PassRegistry::Instance().Get("multi_devices_pass"); + multi_devices_pass->SetNotOwned>("places", + &places); + multi_devices_pass->SetNotOwned("loss_var_name", + &loss_var_name); + multi_devices_pass->SetNotOwned>( + "params", ¶m_names); + multi_devices_pass->SetNotOwned>("local_scopes", + &local_scopes); + multi_devices_pass->SetNotOwned("strategy", &strategy); + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + multi_devices_pass->SetNotOwned("nccl_ctxs", nctx); +#endif + graph = multi_devices_pass->Apply(std::move(graph)); + + // Apply a graph print pass to record a graph with device info. + if (!strategy.debug_graphviz_path_.empty()) { + auto multi_devices_print_pass = + ir::PassRegistry::Instance().Get("multi_devices_print_pass"); + multi_devices_print_pass->SetNotOwned( + "debug_graphviz_path", &strategy.debug_graphviz_path_); + multi_devices_print_pass->Set( + "graph_printer", new details::GraphvizSSAGraphPrinter); + graph = multi_devices_print_pass->Apply(std::move(graph)); + } + + // Verify that the graph is correct for multi-device executor. 
+ auto multi_devices_check_pass = + ir::PassRegistry::Instance().Get("multi_devices_check_pass"); + graph = multi_devices_check_pass->Apply(std::move(graph)); + return graph; +} + class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -51,6 +110,7 @@ class ParallelExecutorPrivate { #endif bool own_local_scope_; bool use_cuda_; + bool use_all_reduce_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -68,6 +128,14 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->use_all_reduce_ = + build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + + if (!member_->use_all_reduce_) { + PADDLE_ENFORCE(places.size() > 1, + "If you set build_strategy.reduce with 'Reduce'," + "the number of places must be greater than 1."); + } // Step 1. Bcast the params to devs. // Create local scopes @@ -101,7 +169,7 @@ ParallelExecutor::ParallelExecutor( } if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - BCastParamsToGPUs(bcast_vars); + BCastParamsToDevices(bcast_vars); } // Startup Program has been run. All local scopes has correct parameters. @@ -114,44 +182,49 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = var->Persistable(); } - // Step 3. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - details::SSAGraphBuilderFactory builder_factory( - member_->places_, loss_var_name, params, member_->local_scopes_, - build_strategy); - if (member_->use_cuda_) { +// Step 3. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp #ifdef PADDLE_WITH_CUDA - std::unique_ptr graph = ApplyParallelExecutorPass( - main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy, - member_->nccl_ctxs_.get()); - - auto max_memory_size = GetEagerDeletionThreshold(); - if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); - } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy, + member_->nccl_ctxs_.get()); + + auto max_memory_size = GetEagerDeletionThreshold(); + if (max_memory_size >= 0) { + for (auto &place : member_->places_) { + if (!platform::is_gpu_place(place)) continue; + auto gpu_place = boost::get(place); + if (gcs_[gpu_place.device] == nullptr) { + ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); + cur_ref_cnts_[gpu_place.device].reset( + new details::AtomicReferenceCountMap()); + gcs_[gpu_place.device].reset( + new StreamGarbageCollector(gpu_place, 
max_memory_size)); } } + if (!gcs_.empty()) { + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = ref_cnt_pass->Apply(std::move(graph)); + graph->SetNotOwned("garbage_collector", &gcs_); + } + } #else - PADDLE_THROW("Not compiled with CUDA"); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy); #endif + + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, std::move(graph))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, std::move(graph))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -159,16 +232,23 @@ ParallelExecutor::ParallelExecutor( member_->places_, std::move(member_->executor_))); } -void ParallelExecutor::BCastParamsToGPUs( +void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { - // the the initializing bcast, all vars would be bcast from device(0), + // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. - bool initializing = builder_.get() == nullptr ? true : false; - + bool initializing = member_->executor_ ? false : true; for (auto &var : vars) { - int var_dev_id = - builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + int var_dev_id = -1; + if (member_->executor_) { + auto &sharded_var_device = + member_->executor_->Graph().Get( + details::kShardedVarDevice); + if (sharded_var_device.find(var) != sharded_var_device.end()) { + var_dev_id = sharded_var_device.at(var); + } + } + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; @@ -230,12 +310,23 @@ void ParallelExecutor::BCastParamsToGPUs( #endif } else { platform::CPUPlace cpu; - for (size_t i = 1; i < member_->places_.size(); ++i) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + if ((initializing && i == 0) || + (!initializing && static_cast(i) == var_dev_id)) + continue; + auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - t->mutable_data(cpu, main_tensor.type()); - paddle::framework::TensorCopy(main_tensor, cpu, t); + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
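The constructor now picks the underlying SSA graph executor from exec_strategy.type_ and always wraps the result in a ScopeBufferedSSAGraphExecutor. A condensed, illustrative rendering of that selection (argument lists abbreviated):

std::unique_ptr<details::SSAGraphExecutor> exec;
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
  exec.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places, std::move(graph)));
} else {
  exec.reset(new details::FastThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places, std::move(graph)));
}
// ...which is then wrapped, roughly: ScopeBufferedSSAGraphExecutor(
//     exec_strategy, local_scopes, var_infos, places, std::move(exec))
// so local scopes are cleaned up between iterations.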
+ if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); + } else { + t->ShareDataWith(main_tensor); + } } } } @@ -291,7 +382,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ParallelExecutor::~ParallelExecutor() { if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { - member_->global_scope_->DeleteScope(member_->local_scopes_[i]); + Scope *local_scope = member_->local_scopes_[i]; + if (member_->global_scope_->HasKid(local_scope)) { + member_->global_scope_->DeleteScope(local_scope); + } } } } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 2aa438e320a0f191f78a6274b8ad8453f1736ef4..a0f66c3f8fb82c1120cb1845d1d3fbee6df127f6 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -21,7 +21,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" @@ -68,7 +68,7 @@ class ParallelExecutor { void Run(const std::vector &fetch_tensors, const std::string &fetched_var_name); - void BCastParamsToGPUs(const std::unordered_set &vars) const; + void BCastParamsToDevices(const std::unordered_set &vars) const; private: ParallelExecutorPrivate *member_; diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 1e01a6e900404990e16674755367d2fc6d832725..589905828f7793c614c0fe12259e9ba5ab11ceac 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -15,6 +15,7 @@ limitations under the License. 
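The CPU broadcast branch above either deep-copies a parameter into each local scope or aliases the source buffer. A standalone sketch of the practical difference (names are illustrative, not part of the patch):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"

void BroadcastSketch(const paddle::framework::Tensor &main_tensor,
                     paddle::framework::Tensor *dst, bool deep_copy) {
  paddle::platform::CPUPlace cpu;
  if (deep_copy) {
    // Each scope gets its own buffer; later in-place updates stay local.
    dst->Resize(main_tensor.dims());
    dst->mutable_data(cpu, main_tensor.type());
    paddle::framework::TensorCopy(main_tensor, cpu, dst);
  } else {
    // All scopes alias one buffer; cheap, but every writer is visible to all,
    // which is why @LR_DECAY_COUNTER@ is excluded from sharing above.
    dst->ShareDataWith(main_tensor);
  }
}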
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/version.h" namespace paddle { namespace framework { @@ -38,7 +39,10 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } +int64_t ProgramDesc::Version() const { return desc_.version().version(); } + ProgramDesc::ProgramDesc() { + desc_.mutable_version()->set_version(kCurProgramVersion); auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); @@ -55,11 +59,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { auto all_ops = blocks_[block_id]->AllOps(); for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) { auto &op = all_ops[op_id]; + for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { int sub_block_id = - o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name); + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { + std::vector sub_block_ids = + o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); + std::vector block_descs; + for (int block_id : sub_block_ids) { + block_descs.push_back(MutableBlock(block_id)); + } + op->SetBlocksAttr(attr_name, block_descs); } } } @@ -68,24 +81,22 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { desc_ = desc; - for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDesc(this, &block_desc)); - } - for (auto &block : blocks_) { - for (auto *op : block->AllOps()) { - for (const auto &attr : op->Proto()->attrs()) { - if (attr.type() == proto::AttrType::BLOCK) { - size_t blk_idx = attr.block_idx(); - op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); - } - } - } - } + InitFromProto(); +} + +void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) { + blocks_.clear(); + desc_ = desc; + InitFromProto(); } ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); + InitFromProto(); +} + +void ProgramDesc::InitFromProto() { for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } @@ -95,6 +106,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { if (attr.type() == proto::AttrType::BLOCK) { size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); + } else if (attr.type() == proto::AttrType::BLOCKS) { + auto blks_idx = attr.blocks_idx(); + std::vector block_descs; + for (int blk_idx : blks_idx) { + block_descs.push_back(this->MutableBlock(blk_idx)); + } + op->SetBlocksAttr(attr.name(), block_descs); } } } @@ -103,10 +121,16 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { const std::vector ProgramDesc::GetFeedTargetNames() { auto &global_block = Block(0); + // The order of feed_target_names must follow the index specified in `col`. + // since feed operator's order doesn't necessary follow 'col'. 
std::vector feed_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFeedOpType) { - feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); + int col = boost::get(op->GetAttr("col")); + if (col >= feed_target_names.size()) { + feed_target_names.resize(col + 1); + } + feed_target_names[col] = op->Output("Out")[0]; } } return feed_target_names; @@ -114,10 +138,16 @@ const std::vector ProgramDesc::GetFeedTargetNames() { const std::vector ProgramDesc::GetFetchTargetNames() { auto &global_block = Block(0); + // The order of fetch_target_names must follow the index specified in `col`. + // since fetch operator's order doesn't necessary follow 'col'. std::vector fetch_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFetchOpType) { - fetch_target_names.push_back(op->Input("X")[0]); + int col = boost::get(op->GetAttr("col")); + if (col >= fetch_target_names.size()) { + fetch_target_names.resize(col + 1); + } + fetch_target_names[col] = op->Input("X")[0]; } } return fetch_target_names; diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 65fa0a0cfd5ba6d9b8765cee1309e118cb74348a..2ec0e9d7a0969d44f88c7407bfb8cd4646530147 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -53,8 +53,12 @@ class ProgramDesc { void Flush(); + void CopyFrom(const proto::ProgramDesc &desc); + proto::ProgramDesc *Proto(); + int64_t Version() const; + // The output variable of feed_op is referenced as feed_target. // This function is used to collect the output variable's name of all // feed_ops. @@ -76,6 +80,8 @@ class ProgramDesc { void SetFetchHolderName(const std::string &fetch_holder_name); private: + void InitFromProto(); + proto::ProgramDesc desc_; std::vector> blocks_; diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b..7e689a37da8a16bd9b1ac6650b9322d2eb5a2c85 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) { out->SetType(proto::VarType::LOD_TENSOR); op->SetOutput("Y", {out->Name()}); + BlockDesc* new_block = program.AppendBlock(*global_block); + op = new_block->AppendOp(); + op->SetType("mul"); + + op = global_block->AppendOp(); + op->SetType("op_with_subblock"); + op->SetAttr("sub_block", new_block); + + std::vector sub_blocks; + sub_blocks.push_back(program.AppendBlock(*global_block)); + sub_blocks.push_back(program.AppendBlock(*global_block)); + op->SetAttr("sub_blocks", sub_blocks); + ProgramDesc program_copy(program); auto* global_block_copy = program_copy.MutableBlock(0); @@ -64,6 +77,8 @@ TEST(ProgramDesc, copy_ctor) { assert_same_var("Y", y); assert_same_var("Out", out); + bool found_sub_block = false; + bool found_sub_blocks = false; for (size_t i = 0; i < global_block->OpSize(); ++i) { auto op_origin = global_block->Op(i); auto op_copy = global_block_copy->Op(i); @@ -72,10 +87,28 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); - ASSERT_EQ(op_copy->Proto()->SerializeAsString(), - op_origin->Proto()->SerializeAsString()); + ASSERT_EQ(op_origin->Proto()->attrs().size(), + op_copy->Proto()->attrs().size()); + for (auto it = op_origin->Proto()->attrs().begin(); + it != op_origin->Proto()->attrs().end(); ++it) { + for (auto it_2 = op_copy->Proto()->attrs().begin(); + 
it_2 != op_copy->Proto()->attrs().end(); ++it_2) { + if (it->name() == it_2->name()) { + ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString()); + } + } + } + + if (op->Type() == "op_with_subblock") { + ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); + found_sub_block = true; + + ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size()); + found_sub_blocks = true; + } } - + ASSERT_TRUE(found_sub_block); + ASSERT_TRUE(found_sub_blocks); // Not check block's protostr are same it because the order of vars could be // different and it is correct. } diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 0b36f1116d15004b355e854e101abb9ad3297836..40eafda9bf294f7e8ddd067e9014447f4de1cc0e 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -13,29 +13,62 @@ // limitations under the License. #include "paddle/fluid/framework/reader.h" +#include namespace paddle { namespace framework { -ReaderBase::~ReaderBase() {} - -FileReader::FileReader(const std::vector &dims) : dims_(dims) {} -void FileReader::ReadNext(std::vector *out) { +void ReaderBase::ReadNext(std::vector *out) { + std::lock_guard lock(mu_); + PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning); ReadNextImpl(out); - if (out->empty()) { - return; - } +} - PADDLE_ENFORCE_EQ(out->size(), dims_.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - auto &actual = (*out)[i].dims(); - auto &expect = dims_[i]; +void ReaderBase::InsertDecoratedReader( + const std::shared_ptr &decorated_reader) { + std::lock_guard guard(mu_); + decorated_readers_.emplace_back(decorated_reader); +} - PADDLE_ENFORCE_EQ(actual.size(), expect.size()); - for (int j = 0; j < actual.size(); ++j) { - // PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1); +std::unordered_set ReaderBase::GetEndPoints() { + std::unordered_set result; + std::deque queue; + queue.emplace_back(this); + while (!queue.empty()) { // BFS search + auto *front = queue.front(); + queue.pop_front(); + if (front->decorated_readers_.empty()) { + result.emplace(front); + } else { + for (auto &reader : front->decorated_readers_) { + if (auto *reader_ptr = reader.lock().get()) { + queue.emplace_back(reader_ptr); + } + } } } + + return result; } + +void ReaderBase::Shutdown() { + std::lock_guard lock(mu_); + if (status_ != ReaderStatus::kStopped) { + ShutdownImpl(); + status_ = ReaderStatus::kStopped; + } +} + +void ReaderBase::Start() { + std::lock_guard lock(mu_); + if (status_ != ReaderStatus::kRunning) { + StartImpl(); + status_ = ReaderStatus::kRunning; + } +} + +ReaderBase::~ReaderBase() {} + +DecoratedReader::~DecoratedReader() { reader_->Shutdown(); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 64d4ceab624312ed366d7e835072899f1f033a88..82562bf883d88787858912f7039cf8fef003eccf 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/ddim.h" @@ -26,59 +27,116 @@ namespace framework { class ReaderBase { public: - virtual void ReadNext(std::vector* out) = 0; + virtual void ReadNext(std::vector* out); - virtual void ReInit() = 0; + virtual void Shutdown(); + + virtual void Start(); + + // Return the readers which are the end of decorating chain. Basically + // they are readers just before read op. 
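Restarting a whole decorating chain now means shutting down and restarting only its end-point readers; the calls propagate down to the root through ShutdownImpl()/StartImpl(). A usage sketch against the API introduced here (the helper name is illustrative; this is essentially what ReaderHolder::ResetAll below does):

// `reader` is a root ReaderBase with decorated readers hanging off it.
void RestartChain(paddle::framework::ReaderBase *reader) {
  auto end_points = reader->GetEndPoints();  // leaves of the decorating chain
  for (auto *r : end_points) r->Shutdown();  // propagates via ShutdownImpl()
  for (auto *r : end_points) r->Start();     // propagates via StartImpl()
}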
+ std::unordered_set GetEndPoints(); virtual ~ReaderBase(); + + protected: + virtual void ReadNextImpl(std::vector* out) {} + + virtual void ShutdownImpl() {} + + virtual void StartImpl() {} + + enum ReaderStatus { kRunning, kStopped }; + + ReaderStatus status_{kRunning}; + + mutable std::mutex mu_; + + private: + friend class DecoratedReader; + // These methods can be only invoked inside DecoratedReader to record the + // decorating chain. + void InsertDecoratedReader( + const std::shared_ptr& decorated_reader); + // A set of which readers that decorated this reader. + std::vector> decorated_readers_; }; -class DecoratedReader : public ReaderBase { +class DecoratedReader : public ReaderBase, + public std::enable_shared_from_this { public: explicit DecoratedReader(const std::shared_ptr& reader) : ReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } - void ReInit() override { reader_->ReInit(); } - - protected: - std::shared_ptr reader_; -}; - -class FileReader : public ReaderBase { - public: - explicit FileReader(const std::vector& dims); + void RegisterDecorateChain() { + reader_->InsertDecoratedReader(shared_from_this()); + } - void ReadNext(std::vector* out) override; + ~DecoratedReader(); protected: - virtual void ReadNextImpl(std::vector* out) = 0; + void ShutdownImpl() override { reader_->Shutdown(); } - private: - std::vector dims_; + void StartImpl() override { reader_->Start(); } + + std::shared_ptr reader_; }; +// FileReader is just a conceptual class. +class FileReader : public ReaderBase {}; + // The ReaderHolder is used as reader' unified wrapper, // making it easier to access different type reader in Variables. class ReaderHolder { public: - void Reset(ReaderBase* reader) { reader_.reset(reader); } + template + void Reset(const std::shared_ptr& reader) { + auto reader_base = std::dynamic_pointer_cast(reader); + PADDLE_ENFORCE_NOT_NULL(reader_base); + reader_ = reader_base; + } - std::shared_ptr Get() const { return reader_; } + const std::shared_ptr& Get() const { return reader_; } void ReadNext(std::vector* out) { PADDLE_ENFORCE_NOT_NULL(reader_); reader_->ReadNext(out); } - void ReInit() { + + void ResetAll() { + auto end_readers = reader_->GetEndPoints(); + for (auto* reader : end_readers) { + reader->Shutdown(); + } + for (auto* reader : end_readers) { + reader->Start(); + } + } + + void Shutdown() { PADDLE_ENFORCE_NOT_NULL(reader_); - reader_->ReInit(); + reader_->Shutdown(); } + void Start() { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->Start(); + } + + operator const std::shared_ptr&() const { return this->reader_; } + private: std::shared_ptr reader_; }; +template +inline std::shared_ptr MakeDecoratedReader(ARGS&&... args) { + std::shared_ptr reader(new T(std::forward(args)...)); + reader->RegisterDecorateChain(); + return reader; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0d07cb7c1367576084b9494e7758103bb45d1e5 --- /dev/null +++ b/paddle/fluid/framework/reader_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
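Concrete decorators are expected to be created through MakeDecoratedReader so the decorating chain gets recorded. A hedged sketch of a do-nothing pass-through decorator (the class name is hypothetical):

#include <memory>
#include <vector>
#include "paddle/fluid/framework/reader.h"

namespace fw = paddle::framework;

class PassThroughReader : public fw::DecoratedReader {
 public:
  explicit PassThroughReader(const std::shared_ptr<fw::ReaderBase> &reader)
      : DecoratedReader(reader) {}

 protected:
  void ReadNextImpl(std::vector<fw::LoDTensor> *out) override {
    reader_->ReadNext(out);  // forward the batch unchanged
  }
};

// Usage: construction via MakeDecoratedReader registers the decorator on
// root's chain as a side effect (root is a std::shared_ptr<ReaderBase>):
//   auto decorated = fw::MakeDecoratedReader<PassThroughReader>(root);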
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/reader.h" +#include +#include "gtest/gtest.h" + +class StubDecoratedReader : public paddle::framework::DecoratedReader { + public: + explicit StubDecoratedReader(const std::shared_ptr &reader) + : DecoratedReader(reader) {} + + void ReadNextImpl(std::vector *out) override {} +}; + +class StubRootReader : public paddle::framework::ReaderBase { + public: + void ReadNextImpl(std::vector *out) override {} +}; + +TEST(READER, decorate_chain) { + auto root = std::make_shared(); + auto end_point1 = + paddle::framework::MakeDecoratedReader(root); + auto end_point2 = + paddle::framework::MakeDecoratedReader(root); + + { + auto endpoints = root->GetEndPoints(); + ASSERT_EQ(endpoints.size(), 2U); + ASSERT_NE(endpoints.count(end_point1.get()), 0); + ASSERT_NE(endpoints.count(end_point2.get()), 0); + } + + { + auto end_point3 = + paddle::framework::MakeDecoratedReader(root); + ASSERT_EQ(root->GetEndPoints().size(), 3U); + } + { ASSERT_EQ(root->GetEndPoints().size(), 2U); } +} diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h new file mode 100644 index 0000000000000000000000000000000000000000..da163835e8652ae479121bd67f2eed77332b2740 --- /dev/null +++ b/paddle/fluid/framework/rw_lock.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if !defined(_WIN32) +#include +#endif // !_WIN32 + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +#if !defined(_WIN32) +struct RWLock { + RWLock() { pthread_rwlock_init(&lock_, nullptr); } + + ~RWLock() { pthread_rwlock_destroy(&lock_); } + + void RDLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, + "acquire read lock failed"); + } + + void WRLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, + "acquire write lock failed"); + } + + void UNLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + } + + private: + pthread_rwlock_t lock_; +}; +#else +// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive +// In windows, rw_lock seems like a hack. Use empty object and do nothing. 
+struct RWLock { + void RDLock() {} + void WRLock() {} + void UNLock() {} +}; +#endif + +class RWLockGuard { + public: + enum Status { kUnLock, kWRLock, kRDLock }; + + RWLockGuard(RWLock* rw_lock, Status init_status) + : lock_(rw_lock), status_(Status::kUnLock) { + switch (init_status) { + case Status::kRDLock: { + RDLock(); + break; + } + case Status::kWRLock: { + WRLock(); + break; + } + case Status::kUnLock: { + break; + } + } + } + + void WRLock() { + switch (status_) { + case Status::kUnLock: { + lock_->WRLock(); + status_ = Status::kWRLock; + break; + } + case Status::kWRLock: { + break; + } + case Status::kRDLock: { + PADDLE_THROW( + "Please unlock read lock first before invoking write lock."); + break; + } + } + } + + void RDLock() { + switch (status_) { + case Status::kUnLock: { + lock_->RDLock(); + status_ = Status::kRDLock; + break; + } + case Status::kRDLock: { + break; + } + case Status::kWRLock: { + PADDLE_THROW( + "Please unlock write lock first before invoking read lock."); + break; + } + } + } + + void UnLock() { + if (status_ != Status::kUnLock) { + lock_->UNLock(); + status_ = Status::kUnLock; + } + } + + ~RWLockGuard() { UnLock(); } + + private: + RWLock* lock_; + Status status_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..16f9cbb65229f10912ee90436c3557aaaca169b8 --- /dev/null +++ b/paddle/fluid/framework/rw_lock_test.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
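RWLockGuard gives scoped acquisition on top of RWLock, releasing whatever is held when it goes out of scope. A small usage sketch (function and variable names are illustrative):

#include "paddle/fluid/framework/rw_lock.h"

paddle::framework::RWLock table_lock;
int shared_value = 0;

int ReadValue() {
  paddle::framework::RWLockGuard guard(&table_lock,
                                       paddle::framework::RWLockGuard::kRDLock);
  return shared_value;  // read lock released by the guard's destructor
}

void WriteValue(int v) {
  paddle::framework::RWLockGuard guard(&table_lock,
                                       paddle::framework::RWLockGuard::kWRLock);
  shared_value = v;
}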
*/ + +#include "paddle/fluid/framework/rw_lock.h" +#include +#include // NOLINT +#include // NOLINT +#include + +namespace f = paddle::framework; + +void f1(f::RWLock *lock) { + lock->RDLock(); + lock->UNLock(); +} + +TEST(RWLOCK, read_read) { + f::RWLock lock; + lock.RDLock(); + std::thread t1(f1, &lock); + std::thread t2(f1, &lock); + t1.join(); + t2.join(); + lock.UNLock(); +} + +void f2(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 0UL); + lock->UNLock(); +} + +void f3(f::RWLock *lock, std::vector *result) { + lock->WRLock(); + result->push_back(1); + lock->UNLock(); +} + +TEST(RWLOCK, read_write) { + f::RWLock lock; + std::vector result; + + lock.RDLock(); + std::thread t1(f2, &lock, &result); + t1.join(); + std::thread t2(f3, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + ASSERT_EQ(result.size(), 0UL); + lock.UNLock(); + t2.join(); + ASSERT_EQ(result.size(), 1UL); +} + +void f4(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 1UL); + lock->UNLock(); +} + +TEST(RWLOCK, write_read) { + f::RWLock lock; + std::vector result; + + lock.WRLock(); + std::thread t1(f4, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + result.push_back(1); + lock.UNLock(); + t1.join(); +} diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index caea191cb3513fbe701df0dca668d28fefb6a1d3..ece9a69a99852d81beda00a43362daef16cd0565 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -84,6 +84,12 @@ void Scope::DropKids() { kids_.clear(); } +bool Scope::HasKid(const Scope* scope) const { + std::unique_lock lock(mutex_); + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + return it != this->kids_.end(); +} + std::vector Scope::LocalVarNames() const { std::unique_lock lock(mutex_); std::vector known_vars; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 47d040240a213f65153252419ebb429461e866c5..e42fff1d79d92fb7ed61768a614d8cd98f6775a0 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -73,6 +73,9 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); + /// Find if a scope exists in the kid scopes + bool HasKid(const Scope* scope) const; + // enumerate all the variables current contains. 
std::vector LocalVarNames() const; diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 06ed87e7e8a2d5324b48a466b05207042ec1b7fa..8c290bb095d554a973e66a3a19606a06759fd668 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -49,7 +49,7 @@ struct TensorCopyVisitor { size_(size) {} template - void operator()() const { + void apply() const { // TODO(Yancey1989): support other place platform::CPUPlace cpu; memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, @@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -std::vector> SelectedRows::Get( - const std::vector& keys, framework::Tensor* value) const { +int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { + rwlock_->RDLock(); + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + rwlock_->UNLock(); + if (!auto_grown) { + PADDLE_THROW("key %d not found", key); + } + rwlock_->WRLock(); + auto map_size = id_to_index_.size(); + auto vector_size = rows_.size(); + if (map_size != vector_size) { + rwlock_->UNLock(); + PADDLE_THROW( + "id_to_index_ size %d should have the same size with rows_ %d", + map_size, vector_size); + } + auto write_iter = id_to_index_.find(key); + if (write_iter == id_to_index_.end()) { + int row_num = rows_.size(); + if (row_num == value_->dims()[0]) { + rwlock_->UNLock(); + PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + } + // key logic to put a key into id_to_index_ + rows_.push_back(key); + auto index = static_cast(rows_.size() - 1); + id_to_index_[key] = index; + rwlock_->UNLock(); + return index; + } else { + auto index = write_iter->second; + rwlock_->UNLock(); + return index; + } + } else { + auto index = iter->second; + rwlock_->UNLock(); + return index; + } +} + +void SelectedRows::SyncIndex() { + rwlock_->WRLock(); + id_to_index_.clear(); + for (size_t i = 0; i < rows_.size(); ++i) { + id_to_index_[rows_[i]] = i; + } + rwlock_->UNLock(); +} + +void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, + bool auto_grown) { PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); - std::vector> non_keys_pair; - if (keys.empty()) { + if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], "output tensor should have the same shape with table " "except the dims[0]."); - - for (size_t i = 0; i < keys.size(); ++i) { - int64_t index = Index(keys[i]); - if (index == -1) { - non_keys_pair.push_back( - std::make_pair(keys[i], static_cast(i))); - } else { - framework::VisitDataType( - framework::ToDataType(value_->type()), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); - } - } - } - return non_keys_pair; -} - -bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { - PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized."); - if (value_->IsInitialized()) { - PADDLE_ENFORCE_EQ( - value.type(), value_->type(), - "The type of the value should be same with the original value"); - } - PADDLE_ENFORCE_EQ(value.dims()[0], static_cast(1), - "The first dim of value should be 1."); - std::lock_guard lock(*auto_grown_mutex_.get()); - auto index = Index(key); - bool is_new_key = false; - if (index == -1) { - rows_.push_back(key); - index = rows_.size() - 1; - is_new_key = true; 
- // whether need to resize the table - if (static_cast(rows_.size()) > value_->dims()[0]) { - auto dims = value_->dims(); - dims[0] = (dims[0] + 1) << 1; - framework::VisitDataType(framework::ToDataType(value.type()), - ReAllocateVisitor(dims, value_.get())); + for (int i = 0; i < ids.numel(); ++i) { + int64_t index = AutoGrownIndex(ids.data()[i], auto_grown); + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorCopyVisitor(value, i * value_width, *value_.get(), + index * value_width, value_width)); } } - - framework::VisitDataType( - framework::ToDataType(value.type()), - TensorCopyVisitor(value_.get(), - index * value_->numel() / value_->dims()[0], value, - static_cast(0), value.numel())); - return is_new_key; } } // namespace framework diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 7160670ddd204c20021ea87cdd67ee4721d03451..daf5e95304fb84eaba26a30c45414d5021e7ffcb 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -17,10 +17,12 @@ limitations under the License. */ #include #include #include // NOLINT +#include #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" @@ -48,13 +50,13 @@ class SelectedRows { SelectedRows(const std::vector& rows, const int64_t& height) : rows_(rows), height_(height) { value_.reset(new Tensor()); - auto_grown_mutex_.reset(new std::mutex); + rwlock_.reset(new RWLock); } SelectedRows() { height_ = 0; value_.reset(new Tensor()); - auto_grown_mutex_.reset(new std::mutex); + rwlock_.reset(new RWLock); } platform::Place place() const { return value_->place(); } @@ -74,47 +76,51 @@ class SelectedRows { void set_rows(const Vector& rows) { rows_ = rows; } /* - * @brief wheter has the specified key in the table. + * @brief Get the index of key in rows + * + * @return -1 if the key does not exists. + */ + int64_t Index(int64_t key) const { + auto it = std::find(rows_.begin(), rows_.end(), key); + if (it == rows_.end()) { + PADDLE_THROW("id %s not in table", key); + } + return static_cast(std::distance(rows_.begin(), it)); + } + + /* + * @brief whether has the specified key in the table. * * @return true if the key is exists. */ bool HasKey(int64_t key) const; /* - * @brief Get value by the key list, if the + * @brief Get value by the key list. + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. * * @return a list of pair which contains the non-exists key and the index in * the value */ - std::vector> Get(const std::vector& keys, - framework::Tensor* value) const; + void Get(const framework::Tensor& ids, framework::Tensor* value, + bool auto_grown = false); /* - * @brief Set a key-value pair into the table. - * This function will double the value memory if it's not engouth. + * @brief Get the index of the key from id_to_index_ map. If the key not + * exist, + * add the key into id_to_index_. * - * @note: - * 1. The first dim of the value should be 1 - * 2. The value should be initialized and the data type - * should be the same with the table. - * - * @return true if the key is a new one, otherwise false + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. * + * @return index of the key. 
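With the old Set/Get-by-key interface removed, sparse-table lookups now go through an ids Tensor plus AutoGrownIndex. A hedged usage sketch following the new interface (sizes are illustrative):

#include "paddle/fluid/framework/selected_rows.h"

void SparseLookupSketch() {
  namespace fw = paddle::framework;
  paddle::platform::CPUPlace cpu;

  fw::SelectedRows table;
  table.mutable_value()->Resize(fw::make_ddim({100, 8}));  // capacity x width
  table.mutable_value()->mutable_data<float>(cpu);

  fw::Tensor ids;
  ids.Resize(fw::make_ddim({2}));
  auto *ids_data = ids.mutable_data<int64_t>(cpu);
  ids_data[0] = 42;
  ids_data[1] = 7;

  fw::Tensor out;
  out.mutable_data<float>(fw::make_ddim({2, 8}), cpu);
  // auto_grown=true: unseen keys are appended to rows_ and indexed on the fly.
  table.Get(ids, &out, /*auto_grown=*/true);

  // SyncIndex() rebuilds the key->index map; it is needed when rows_ is set
  // directly via set_rows() instead of through AutoGrownIndex().
  table.SyncIndex();
}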
*/ - bool Set(int64_t key, const Tensor& value); + int64_t AutoGrownIndex(int64_t key, bool auto_grown); - /* - * @brief Get the index of key in rows - * - * @return -1 if the key does not exists. - */ - int64_t Index(int64_t key) const { - auto it = std::find(rows_.begin(), rows_.end(), key); - if (it == rows_.end()) { - return static_cast(-1); - } - return static_cast(std::distance(rows_.begin(), it)); - } + void SyncIndex(); DDim GetCompleteDims() const { std::vector dims = vectorize(value_->dims()); @@ -127,9 +133,10 @@ class SelectedRows { // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. Vector rows_; + std::unordered_map id_to_index_; std::unique_ptr value_{nullptr}; int64_t height_; - std::unique_ptr auto_grown_mutex_{nullptr}; + std::unique_ptr rwlock_{nullptr}; }; /* diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index eefcaa5672c5a3debf162f5c8eda653408dcf221..5ca864cfdf7176850dd31dd42ef3306061a742cf 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/selected_rows.h" +#include +#include // NOLINT + #include "gtest/gtest.h" +#include "paddle/fluid/framework/selected_rows.h" namespace paddle { namespace framework { @@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); } -TEST_F(SelectedRowsTester, SparseTable) { +TEST(SelectedRows, SparseTable) { platform::CPUPlace cpu; SelectedRows table; + + int64_t table_size = 100; + int64_t embedding_width = 8; // initialize a sparse table - table.mutable_value()->Resize(framework::make_ddim({1, 100})); - table.mutable_value()->mutable_data(cpu); - table.mutable_rows()->push_back(1); + table.mutable_value()->Resize( + framework::make_ddim({table_size, embedding_width})); + auto* data = table.mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data[i * embedding_width + j] = static_cast(i); + } + } + ASSERT_EQ(table.AutoGrownIndex(10, true), 0); + ASSERT_EQ(table.AutoGrownIndex(8, true), 1); + ASSERT_EQ(table.AutoGrownIndex(8, true), 1); + ASSERT_EQ(table.AutoGrownIndex(6, true), 2); + ASSERT_TRUE(table.HasKey(10)); + ASSERT_TRUE(table.HasKey(8)); + ASSERT_TRUE(table.HasKey(6)); + ASSERT_EQ(table.rows().size(), 3); + + framework::Tensor ids; + ids.Resize(framework::make_ddim({4})); + auto* ids_data = ids.mutable_data(cpu); + ids_data[0] = static_cast(6); + ids_data[1] = static_cast(6); + ids_data[2] = static_cast(8); + ids_data[3] = static_cast(10); - int64_t key = 10000; - int64_t non_key = 999; - framework::Tensor value; - value.Resize(framework::make_ddim({1, 100})); - auto ptr = value.mutable_data(cpu); - ptr[0] = static_cast(10); + framework::Tensor get_value; + auto* value_data = get_value.mutable_data( + framework::make_ddim({4, embedding_width}), cpu); + table.Get(ids, &get_value); - ASSERT_EQ(table.rows().size(), static_cast(1)); - ASSERT_EQ(table.HasKey(key), false); + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[0 * embedding_width + j], 2); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[1 * 
embedding_width + j], 2); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[2 * embedding_width + j], 1); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[3 * embedding_width + j], 0); + } +} - table.Set(key, value); +void f1(SelectedRows* table, int table_size) { + for (int i = 1000000; i > 0; --i) { + auto id = i % table_size; + int64_t index1 = table->AutoGrownIndex(id, true); + int64_t index2 = table->AutoGrownIndex(id, false); + int64_t index3 = table->AutoGrownIndex(id, true); + ASSERT_EQ(index1, index2); + ASSERT_EQ(index2, index3); + } +} - ASSERT_EQ(table.rows().size(), static_cast(2)); - ASSERT_EQ(table.HasKey(key), true); - // check re-allocate - ASSERT_EQ(table.value().dims()[0], static_cast(4)); +void f2(SelectedRows* table, int table_size) { + for (int i = 0; i < 1000000; ++i) { + auto id = i % table_size; + int64_t index1 = table->AutoGrownIndex(id, true); + int64_t index2 = table->AutoGrownIndex(id, false); + int64_t index3 = table->AutoGrownIndex(id, true); + ASSERT_EQ(index1, index2); + ASSERT_EQ(index2, index3); + } +} - framework::Tensor get_value; - get_value.mutable_data(framework::make_ddim({2, 100}), cpu); - std::vector keys({non_key, key}); - auto non_key_pairs = table.Get(keys, &get_value); +void f3(SelectedRows* table, int table_size) { + clock_t t1 = clock(); + for (int i = 100000; i > 0; --i) { + auto id1 = table->AutoGrownIndex(i % table_size, true); + auto id2 = table->Index(i % table_size); + ASSERT_EQ(id1, id2); + } + clock_t t2 = clock(); + std::cout << "f3 run time:" << t2 - t1 << std::endl; +} + +void f4(SelectedRows* table, int table_size) { + clock_t t1 = clock(); + for (int i = 0; i < 100000; ++i) { + auto id1 = table->AutoGrownIndex(i % table_size, true); + auto id2 = table->Index(i % table_size); + ASSERT_EQ(id1, id2); + } + clock_t t2 = clock(); + std::cout << "f4 run time:" << t2 - t1 << std::endl; +} + +TEST(SelectedRows, MultiThreadAutoIndex) { + platform::CPUPlace cpu; + SelectedRows table; + + int64_t table_size = 100000; + int64_t embedding_width = 8; + // initialize a sparse table + table.mutable_value()->Resize( + framework::make_ddim({table_size, embedding_width})); + auto* data = table.mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data[i * embedding_width + j] = static_cast(i); + } + } - ASSERT_EQ(get_value.data()[100], static_cast(10)); - ASSERT_EQ(non_key_pairs.size(), static_cast(1)); - ASSERT_EQ(non_key_pairs[0].first, non_key); + std::thread t1(f1, &table, table_size); + std::thread t11(f1, &table, table_size); + std::thread t2(f2, &table, table_size); + std::thread t22(f2, &table, table_size); + t1.join(); + t11.join(); + t2.join(); + t22.join(); + std::thread t3(f3, &table, table_size); + std::thread t4(f4, &table, table_size); + t3.join(); + t4.join(); } } // namespace framework diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index c7286dacf01659f3af0927a71856e5a6496cb877..b6ba0df033af12d48e88eb57a3b97b559077250d 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -31,7 +31,8 @@ size_t Tensor::memory_size() const { return holder_ == nullptr ? 
0UL : holder_->size() - offset_; } -void* Tensor::mutable_data(platform::Place place, std::type_index type) { +void* Tensor::mutable_data(platform::Place place, std::type_index type, + size_t requested_size) { if (holder_ != nullptr) { holder_->set_type(type); } @@ -39,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) { "When calling this method, the Tensor's numel must be " "equal or larger than zero. " "Please check Tensor::Resize has been called first."); - int64_t size = numel() * SizeOfType(type); + size_t size = numel() * SizeOfType(type); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, size); + size = requested_size; + } /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { @@ -68,10 +73,10 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) { offset_); } -void* Tensor::mutable_data(platform::Place place) { +void* Tensor::mutable_data(platform::Place place, size_t requested_size) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing."); - return mutable_data(place, holder_->type()); + return mutable_data(place, holder_->type(), requested_size); } Tensor& Tensor::ShareDataWith(const Tensor& src) { @@ -112,5 +117,6 @@ Tensor& Tensor::Resize(const DDim& dims) { const DDim& Tensor::dims() const { return dims_; } int64_t Tensor::numel() const { return product(dims_); } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 775c01765c96ecdc7c3aef5174b90c52ed281e69..f1d268548578fea12082e2edb213a3749eccbfaf 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -82,29 +82,31 @@ class Tensor { template const T* data() const; - bool IsInitialized() const; + inline bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. */ template - T* mutable_data(platform::Place place); + T* mutable_data(platform::Place place, size_t requested_size = 0); - void* mutable_data(platform::Place place, std::type_index type); + void* mutable_data(platform::Place place, std::type_index type, + size_t requested_size = 0); - void* mutable_data(platform::Place place); + void* mutable_data(platform::Place place, size_t requested_size = 0); /** * @brief Return a pointer to mutable memory block. * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * @param[in] requested_size The size of the block in bytes. * * @note If not exist, then allocation. */ template - T* mutable_data(DDim dims, platform::Place place); + T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0); /*! Return the dimensions of the memory block. 
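mutable_data now takes an optional requested_size (in bytes) so callers can over-allocate a buffer once and reuse it; it must be at least numel() * sizeof(T). A short sketch:

#include "paddle/fluid/framework/tensor.h"

void PreallocateSketch() {
  paddle::framework::Tensor t;
  paddle::platform::CPUPlace cpu;
  t.Resize(paddle::framework::make_ddim({4, 8}));
  // Reserve 1024 bytes even though 4*8 floats only need 128; later Resize()
  // calls that stay within the reserved size will not reallocate the holder.
  float *p = t.mutable_data<float>(cpu, /*requested_size=*/1024);
  (void)p;
}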
*/ const DDim& dims() const; diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 7f678f869aac4616c8bca440d0431f765da41dd6..6d3047c95d6cf30c2a5308d4f69ded367066d78c 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -46,19 +46,28 @@ inline T* Tensor::data() { } template -inline T* Tensor::mutable_data(DDim dims, platform::Place place) { +inline T* Tensor::mutable_data(DDim dims, platform::Place place, + size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); Resize(dims); - return mutable_data(place); + return mutable_data(place, requested_size); } template -inline T* Tensor::mutable_data(platform::Place place) { +inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(place, typeid(T))); + return reinterpret_cast(mutable_data(place, typeid(T), requested_size)); } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + int rank = src.dims().size(); + PADDLE_ENFORCE_GE( + rank, 2, + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. Can not be used in reshaping vectors."); + if (rank == 2) { + return src; + } Tensor res; res.ShareDataWith(src); res.Resize(flatten_to_2d(src.dims(), num_col_dims)); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 0a1cb6d5703dace5e6be73285655ecd9d2ad89fb..cb2061c06a429d8e8116001a4aa4e8c46ea13428 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor.h" #include #include +#include "paddle/fluid/platform/float16.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -213,3 +214,17 @@ TEST(Tensor, Layout) { src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } + +TEST(Tensor, FP16) { + using platform::float16; + framework::Tensor src; + float16* src_ptr = src.mutable_data({2, 3}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16)); + // EXPECT a human readable error message + // src.data(); + // Tensor holds the wrong type, it holds N6paddle8platform7float16E at + // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index f98011e896f4033ef210e0eb69f93ce7800a3cd6..05c4a17a01c6fabe48f3fe18544c13153feb0673 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -15,6 +15,7 @@ #include #include #include +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace framework { @@ -148,7 +149,7 @@ struct AnyDTypeVisitor { : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} template - void operator()() const { + void apply() const { auto t = EigenVector::Flatten(tensor_); auto o = EigenScalar::From(*out_); // return any of predicate_(t) is true. 
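ReshapeToMatrix now rejects rank-1 inputs and short-circuits rank-2 ones; for higher ranks it flattens dims [0, num_col_dims) into rows and the rest into columns, sharing the source memory. A small illustration (shapes are illustrative):

#include "paddle/fluid/framework/tensor.h"

void ReshapeSketch() {
  paddle::framework::Tensor src;
  paddle::platform::CPUPlace cpu;
  src.Resize(paddle::framework::make_ddim({8, 3, 32, 32}));
  src.mutable_data<float>(cpu);

  // num_col_dims = 1  ->  an 8 x (3*32*32) matrix aliasing src's buffer.
  auto mat = paddle::framework::ReshapeToMatrix(src, /*num_col_dims=*/1);
  // mat.dims() == {8, 3072}
}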
@@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, os.write(out.data(), size); } { // the 3rd field, tensor data - uint64_t size = tensor.memory_size(); + uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type()); + auto* data_ptr = tensor.data(); PADDLE_ENFORCE(size < std::numeric_limits::max(), "Index overflow when writing tensor"); @@ -300,7 +302,7 @@ struct DeserializedDataFunctor { : buf_(buf), tensor_(tensor), place_(place) {} template - void operator()() { + void apply() { *buf_ = tensor_->mutable_data(place_); } @@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, tensor->Resize(framework::make_ddim(dims)); void* buf; auto ctx = platform::CPUDeviceContext(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::ToTypeIndex(desc.data_type())); if (platform::is_gpu_place(dev_ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA Tensor cpu_tensor; @@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); - is.read(static_cast(buf), cpu_tensor.memory_size()); + is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else @@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); - is.read(static_cast(buf), tensor->memory_size()); + is.read(static_cast(buf), size); } } } diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index f26f212d4d5793b88fd1e6d782cdf983bf341879..18cdca3a658a6a89e6ab637a7f38825756acfea8 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -20,6 +20,9 @@ DEFINE_int32(io_threadpool_size, 100, "number of threads used for doing IO, default 100"); +DEFINE_int32(dist_threadpool_size, 0, + "number of threads used for distributed executed."); + namespace paddle { namespace framework { @@ -35,6 +38,10 @@ void ThreadPool::Init() { if (threadpool_.get() == nullptr) { // TODO(Yancey1989): specify the max threads number int num_threads = std::thread::hardware_concurrency(); + if (FLAGS_dist_threadpool_size > 0) { + num_threads = FLAGS_dist_threadpool_size; + VLOG(1) << "set dist_threadpool_size to " << num_threads; + } PADDLE_ENFORCE_GT(num_threads, 0); threadpool_.reset(new ThreadPool(num_threads)); } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 429997c8b89fef7aa164e878095ab3b5c9998e5b..e9550dbfb976bee70741158b94b04084919e8271 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { template -bool IsType(const std::type_index& type_index) { +inline bool IsType(const std::type_index& type_index) { return type_index == std::type_index(typeid(T)); } diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc new file mode 100644 index 0000000000000000000000000000000000000000..81c0392bf3cc7378cec06a9de3ae81f2b221ecec --- /dev/null +++ b/paddle/fluid/framework/version.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
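TensorToStream/TensorFromStream now size the data field from numel() * SizeOfType(type) instead of memory_size(), because a holder allocated with an over-sized requested_size can be larger than the logical payload. In short (helper name is illustrative):

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor.h"

// Payload size used for serialization after this change.
size_t SerializedPayloadBytes(const paddle::framework::Tensor &t) {
  // memory_size() reports holder_->size() - offset_, which may be larger, so
  // it is no longer used when writing or reading the tensor data field.
  return static_cast<size_t>(t.numel()) *
         paddle::framework::SizeOfType(t.type());
}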
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/version.h" +#include + +namespace paddle { +namespace framework { +bool IsProgramVersionSupported(int64_t version) { + static int num_supported = + sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]); + return std::find(kSupportedProgramVersion, + kSupportedProgramVersion + num_supported, + version) != kSupportedProgramVersion + num_supported; +} + +bool IsTensorVersionSupported(uint32_t version) { + static int num_supported = + sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]); + return std::find(kSupportedTensorVersion, + kSupportedTensorVersion + num_supported, + version) != kSupportedTensorVersion + num_supported; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h new file mode 100644 index 0000000000000000000000000000000000000000..9945bc58c69df8456ff3d1aa0c777970bdbdbf98 --- /dev/null +++ b/paddle/fluid/framework/version.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#pragma once + +namespace paddle { +namespace framework { + +// Note: +// Program and Tensor that pass the IsXXXVersionSupported should +// be supported by the current codes. Otherwise, it's a compatibility +// bug. + +// The program version the current codes generate. +constexpr int64_t kCurProgramVersion = 0; + +// The program version that was generated by previous or current codes +// and supported by current codes. +constexpr int64_t kSupportedProgramVersion[] = {0}; + +// Due to historical reasons, tensor version use uint32_t. +// The tensor version the current codes generate. +constexpr uint32_t kCurTensorVersion = 0; + +// The tensor version that was generated by previous or current codes +// and supported by current codes. +constexpr uint32_t kSupportedTensorVersion[] = {0}; + +bool IsProgramVersionSupported(int64_t version); + +bool IsTensorVersionSupported(uint32_t version); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8c5f256000522af976bbf487741a586f1abc439 --- /dev/null +++ b/paddle/fluid/framework/version_test.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
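Deserialized programs can now be validated against the supported version list before execution. A hedged usage sketch combining ProgramDesc::Version() (added above) with IsProgramVersionSupported (the helper name is illustrative):

#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/version.h"

bool IsSupportedProgram(const std::string &binary_str) {
  // Parses the proto and rebuilds blocks via InitFromProto().
  paddle::framework::ProgramDesc prog(binary_str);
  // False for programs produced by a newer or unknown toolchain.
  return paddle::framework::IsProgramVersionSupported(prog.Version());
}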
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/version.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +TEST(Version, Basic) { + EXPECT_TRUE(IsProgramVersionSupported(0)); + EXPECT_FALSE(IsProgramVersionSupported(1)); + EXPECT_FALSE(IsProgramVersionSupported(-1)); + + EXPECT_TRUE(IsTensorVersionSupported(0)); + EXPECT_FALSE(IsTensorVersionSupported(1)); + EXPECT_FALSE(IsTensorVersionSupported(-1)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 1895aea7f98cb1ad12b2ce16545339252349ea37..efb91bcf75a3cb99a67d5a3251b1d42fc4b04170 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,21 +1,38 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor ) +# analysis and tensorrt must be added before creating static library, +# otherwise, there would be undefined reference to them in static library. +add_subdirectory(analysis) +if (TENSORRT_FOUND) + add_subdirectory(tensorrt) +endif() + +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api SRCS io.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) -if(WITH_CONTRIB) - set(fluid_modules "${fluid_modules}" paddle_inference_api) -endif() +# paddle_fluid_origin exclude inference api interface +cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) + +#if(APPLE) + add_subdirectory(api) +#endif() # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) +if(NOT APPLE) + # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. + set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") + set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +endif() + # Create shared library cc_library(paddle_fluid_shared SHARED - SRCS io.cc + SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) @@ -23,15 +40,22 @@ if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + # check symbol hidden + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake + "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" + "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" + " message(FATAL_ERROR \"Check symbol failed.\")\n" + "endif()\n") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" + DEPENDS paddle_fluid_shared) + add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() if(WITH_TESTING) - # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) -endif() - -add_subdirectory(analysis) - -if (TENSORRT_FOUND) - add_subdirectory(tensorrt) + add_subdirectory(tests/api) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index cdd67fdc929851979fe0a38afe1af74ec7321b8a..c2a1c6634bd8f8de0796456e91cb3c530d4c6823 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,14 +1,24 @@ -cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) +set(analysis_deps + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) + +cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc + analyzer.cc + helper.cc + # passes + analysis_pass.cc fluid_to_data_flow_graph_pass.cc data_flow_graph_to_fluid_pass.cc dfg_graphviz_draw_pass.cc tensorrt_subgraph_pass.cc tensorrt_subgraph_node_mark_pass.cc - analyzer.cc - helper.cc - DEPS framework_proto proto_desc) + fluid_to_ir_pass.cc + model_store_pass.cc + DEPS ${analysis_deps}) + cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -16,23 +26,28 @@ function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS) + set(multiValueArgs SRCS ARGS EXTRA_DEPS) cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - + set(mem_opt "") + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) 
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) +inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/analysis_pass.cc similarity index 91% rename from paddle/fluid/inference/analysis/pass.cc rename to paddle/fluid/inference/analysis/analysis_pass.cc index 121b72c0a0aa9a0c568b04f7ee9a5bc5c1d6f5f8..9be9f755b9ed7273d842f8c0e2046f0ca0ce2247 100644 --- a/paddle/fluid/inference/analysis/pass.cc +++ b/paddle/fluid/inference/analysis/analysis_pass.cc @@ -12,4 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/analysis_pass.h similarity index 58% rename from paddle/fluid/inference/analysis/pass.h rename to paddle/fluid/inference/analysis/analysis_pass.h index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..b6edb5529ace2ad5bd1b35bfbee1f7a744457cc3 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -28,10 +28,10 @@ namespace paddle { namespace inference { namespace analysis { -class Pass { +class AnalysisPass { public: - Pass() = default; - virtual ~Pass() = default; + AnalysisPass() = default; + virtual ~AnalysisPass() = default; // Mutable Pass. virtual bool Initialize(Argument *argument) { return false; } // Readonly Pass. @@ -42,52 +42,25 @@ class Pass { virtual bool Finalize() { return false; } // Get a Pass appropriate to print the Node this pass operates on. - virtual Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const { + virtual AnalysisPass *CreatePrinterPass(std::ostream &os, + const std::string &banner) const { return nullptr; } // Create a debugger Pass that draw the DFG by graphviz toolkit. - virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } - // Run on a single Node. - virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } - // Run on a single Function. - virtual void Run(Function *x) { LOG(FATAL) << "not valid"; } - // Run on a single FunctionBlock. - virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; } // Run on a single DataFlowGraph. - virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; } + virtual void Run(DataFlowGraph *x) = 0; // Human-readable short representation. virtual std::string repr() const = 0; // Human-readable long description. - virtual std::string description() const = 0; -}; - -// NodePass process on any Node types. 
-class NodePass : public Pass { - public: - virtual void Run(Node *node) = 0; -}; - -// NodePass process on any Function node types. -class FunctionPass : public Pass { - public: - virtual void Run(Function *node) = 0; -}; - -// NodePass process on any FunctionBlock node types. -class FunctionBlockPass : public Pass { - public: - virtual void Run(FunctionBlock *node) = 0; + virtual std::string description() const { return "No DOC"; } }; // GraphPass processes on any GraphType. -class DataFlowGraphPass : public Pass { - public: - virtual void Run(DataFlowGraph *graph) = 0; -}; +class DataFlowGraphPass : public AnalysisPass {}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index a4625f008c15300b88ef0bce71cd7d8aa473c9a8..6dc39cae0522efd48c2e2921611adebd6937ddf7 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -14,55 +14,83 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include +#include + #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" -namespace paddle { -namespace inference { -namespace analysis { - -DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, +DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false, "Enable subgraph to TensorRT engine for acceleration"); -DEFINE_string(inference_analysis_graphviz_log_root, "./", +DEFINE_bool(IA_enable_ir, false, "Turn on IR support"); + +DEFINE_string(IA_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +DEFINE_string(IA_output_storage_path, "", "optimized model output path"); + +namespace paddle { +namespace inference { +namespace analysis { + class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { // TODO(Superjomn) set the key with pass reprs. 
- AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); - if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { - auto trt_teller = [](const Node* node) { - if (!node->IsFunction()) return false; - return static_cast(node)->func_type() == "mul"; - }; - AddPass("tensorrt-subgraph-marker", - new TensorRTSubgraphNodeMarkPass(trt_teller)); - AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); + if (!FLAGS_IA_enable_ir) { + AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + } else { + AddPass("fluid-to-ir-pass", new FluidToIrPass); } + TryAddTensorRtPass(); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_IA_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } } std::string repr() const override { return "dfg-pass-manager"; } std::string description() const override { return "DFG pass manager."; } private: - void AddPass(const std::string& name, Pass* pass) { - LOG(INFO) << "Adding pass " << name; + void AddPass(const std::string& name, AnalysisPass* pass) { + VLOG(3) << "Adding pass " << name; Register(name, pass); AddGraphvizDebugerPass(pass); } + void TryAddTensorRtPass() { + if (FLAGS_IA_enable_tensorrt_subgraph_engine) { + auto trt_teller = [&](const Node* node) { + std::unordered_set teller_set( + {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", + "depthwise_conv2d", "batch_norm", "concat"}); + if (!node->IsFunction()) return false; + + const auto* func = static_cast(node); + if (teller_set.count(func->func_type())) { + return true; + } else { + return false; + } + }; + + AddPass("tensorrt-subgraph-marker", + new TensorRTSubgraphNodeMarkPass(trt_teller)); + AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); + } + } + // Add the graphviz debuger pass if the parent pass has one. - void AddGraphvizDebugerPass(Pass* pass) { + void AddGraphvizDebugerPass(AnalysisPass* pass) { auto* debuger_pass = pass->CreateGraphvizDebugerPass(); if (debuger_pass) { - LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]"; Register(debuger_pass->repr(), debuger_pass); } } @@ -71,6 +99,16 @@ class DfgPassManagerImpl final : public DfgPassManager { Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } void Analyzer::Run(Argument* argument) { + std::vector passes; + for (auto& pass : all_ir_passes_) { + if (!disabled_ir_passes_.count(pass)) { + passes.push_back(pass); + passes.push_back("graph_viz_pass"); // add graphviz for debug. + } + } + passes.push_back("graph_viz_pass"); + argument->Set(kFluidToIrPassesAttr, new std::vector(passes)); + for (auto& x : data_) { PADDLE_ENFORCE(x->Initialize(argument)); x->RunAll(); @@ -78,6 +116,11 @@ void Analyzer::Run(Argument* argument) { } } +Analyzer& Analyzer::DisableIrPasses(const std::vector& passes) { + disabled_ir_passes_.insert(passes.begin(), passes.end()); + return *this; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index e9e14fb1947da059c8d126d3da182ce446f6421e..9bdbefc07cbc4bf7a4714927c84855837610430e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -16,43 +16,36 @@ limitations under the License. */ /* * This file contains Analyzer, an class that exposed as a library that analyze - * and optimize - * Fluid ProgramDesc for inference. 
Similar to LLVM, it has multiple flags to - * control whether - * an process is applied on the program. + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether an process is applied on the program. * * The processes are called Passes in analysis, the Passes are placed in a - * pipeline, the first - * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to - * a data flow - * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow - * graph to a - * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes - * which take a - * node or data flow graph as input. + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. * * The Analyzer can be used in two methods, the first is a executable file which - * can be used to - * pre-process the inference model and can be controlled by passing difference - * command flags; + * can be used to pre-process the inference model and can be controlled by + * passing difference command flags; * the other way is to compose inside the inference API as a runtime pre-process - * phase in the - * inference service. + * phase in the inference service. */ #include -#include "paddle/fluid/inference/analysis/pass.h" +#include +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/pass_manager.h" namespace paddle { namespace inference { namespace analysis { -// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this -// flag if not available. -DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); -DECLARE_string(inference_analysis_graphviz_log_root); - class Analyzer : public OrderedRegistry { public: // Register all the pass-managers. @@ -60,7 +53,31 @@ class Analyzer : public OrderedRegistry { void Run(Argument* argument); + Analyzer& DisableIrPasses(const std::vector& passes); + DISABLE_COPY_AND_ASSIGN(Analyzer); + + private: + // All avaiable IR passes. + // The bigger fuse comes first, so that the small operators prefer to be + // merged in a larger fuse op. The small fusion will not break the pattern of + // larger fusion. + const std::vector all_ir_passes_{{ + // Manual update the passes here. + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // +#ifdef PADDLE_WITH_MKLDNN + "conv_relu_mkldnn_fuse_pass", // +#endif + }}; + + std::unordered_set disabled_ir_passes_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e1fe3eb797cdced56a61aa2db0c3d18601824f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_main.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements analysizer -- an executation help to analyze and + * optimize trained model. + */ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + using paddle::inference::analysis::Analyzer; + using paddle::inference::analysis::Argument; + + Argument argument; + Analyzer analyzer; + analyzer.Run(&argument); + + return 0; +} diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index d7c1a72932a39f878add2bb884e280b91d3c38c0..3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -13,17 +13,76 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, main) { +using namespace framework; // NOLINT + +TEST(Analyzer, analysis_without_tensorrt) { + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } +TEST(Analyzer, analysis_with_tensorrt) { + FLAGS_IA_enable_tensorrt_subgraph_engine = true; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); + Analyzer analyser; + analyser.Run(&argument); +} + +void TestWord2vecPrediction(const std::string &model_path) { + NativeConfig config; + config.model_dir = model_path; + config.use_gpu = false; + config.device = 0; + auto predictor = + ::paddle::CreatePaddlePredictor( + config); + + // One single batch + + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. 
+ for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << "data: " + << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); + } +} + +TEST(Analyzer, word2vec_without_analysis) { + TestWord2vecPrediction(FLAGS_inference_model_dir); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 6d316f20bff7a68754b0afec6463bd5d7579227f..e8fb0775b45761f64fd6fd28306c35b76d1e40c4 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,8 +23,10 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { @@ -36,6 +38,16 @@ namespace analysis { * All the fields should be registered here for clearness. */ struct Argument { + Argument() = default; + explicit Argument(const std::string& fluid_model_dir) + : fluid_model_dir(new std::string(fluid_model_dir)) {} + // The directory of the trained model. + std::unique_ptr fluid_model_dir; + // The path of `__model__` and `param`, this is used when the file name of + // model and param is changed. + std::unique_ptr fluid_model_program_path; + std::unique_ptr fluid_model_param_path; + // The graph that process by the Passes or PassManagers. std::unique_ptr main_dfg; @@ -44,6 +56,50 @@ struct Argument { // The processed program desc. std::unique_ptr transformed_program_desc; + + // The output storage path of ModelStorePass. + std::unique_ptr model_output_store_path; + + // Support for any other attributes. + template + void Set(const std::string& key, T* data) { + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]", + key); + attrs_[key] = data; + attr_deleters_[key] = [data, key]() { + VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + VLOG(3) << "argument delete attr: " << key; + delete data; + }; + } + + bool Has(const std::string& name) const { return attrs_.count(name); } + + template + T* Release(const std::string& key) { + PADDLE_ENFORCE(attrs_.count(key)); + auto* res = boost::any_cast(attrs_.at(key)); + attrs_.erase(key); + attr_deleters_.erase(key); + return res; + } + + template + T& Get(const std::string& key) { + PADDLE_ENFORCE(Has(key)); + return *boost::any_cast(attrs_.at(key)); + } + + ~Argument() { + for (auto& item : attr_deleters_) { + item.second(); + } + } + + private: + std::unordered_map attrs_; + std::unordered_map> attr_deleters_; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index d09bf3ed161703b0cf273522921e157c7360a0bc..8c7d58678fd29cb25d13d64a08e6c6f26f242d8b 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -19,14 +19,16 @@ limitations under the License. 
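The Argument::Set/Get/Release attribute store added above keeps type-erased pointers together with a per-key deleter, so whichever pass allocates an attribute (for example the kFluidToIrPassesAttr pass list) does not have to free it by hand. A minimal standalone sketch of the same ownership pattern, using std::any in place of boost::any and plain asserts in place of PADDLE_ENFORCE (the AttrStore name is made up for illustration only):

#include <any>
#include <cassert>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Sketch of a type-erased attribute store that owns its values, mirroring
// Argument::Set/Has/Get/Release above.
class AttrStore {
 public:
  template <typename T>
  void Set(const std::string &key, T *data) {
    assert(data != nullptr);
    assert(attrs_.count(key) == 0 && "duplicate attribute");
    attrs_[key] = data;                        // stored as T* inside std::any
    deleters_[key] = [data] { delete data; };  // freed on destruction...
  }

  bool Has(const std::string &key) const { return attrs_.count(key) > 0; }

  template <typename T>
  T &Get(const std::string &key) {
    assert(Has(key));
    return *std::any_cast<T *>(attrs_.at(key));
  }

  template <typename T>
  T *Release(const std::string &key) {  // ...unless ownership is handed back
    assert(Has(key));
    T *res = std::any_cast<T *>(attrs_.at(key));
    attrs_.erase(key);
    deleters_.erase(key);
    return res;
  }

  ~AttrStore() {
    for (auto &kv : deleters_) kv.second();
  }

 private:
  std::unordered_map<std::string, std::any> attrs_;
  std::unordered_map<std::string, std::function<void()>> deleters_;
};

int main() {
  AttrStore store;
  store.Set("passes", new std::vector<std::string>{"infer_clean_graph_pass"});
  std::cout << store.Get<std::vector<std::string>>("passes").front() << "\n";
  return 0;  // anything not Release()d is deleted by ~AttrStore
}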
*/ namespace paddle { namespace inference { namespace analysis { +using ir_node_t = framework::ir::Node; +using ir_graph_t = framework::ir::Graph; // It is a better idea that the inputs and outputs of this graph is set manually // before, but there must be a Pass that helps to prune the unnecessary ops that // do not contribute to the given targets, so in this pass, analysis and get the // inputs and outputs is OK. void DataFlowGraph::Build() { - inputs.clear(); - outputs.clear(); + inputs_.clear(); + outputs_.clear(); std::unordered_set ins; std::unordered_set outs; for (auto &node : nodes.nodes()) { @@ -42,18 +44,140 @@ void DataFlowGraph::Build() { // similarly, the nodes that in outs but not in ins is the graphs' outputs for (auto *in : ins) { if (!outs.count(in)) { - inputs.push_back(in); + inputs_.push_back(in); } } for (auto *out : outs) { - if (!outs.count(out)) { - outputs.push_back(out); + if (!ins.count(out)) { + outputs_.push_back(out); } } Clean(); } +void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) { + // insert vars + // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id + // will keep updating to its latest alias during the graph-building. + std::unordered_map var2id; + auto &main_block = prog.blocks(framework::kRootBlockIndex); + for (int i = 0; i < main_block.vars_size(); i++) { + const auto &var = main_block.vars(i); + auto *v = nodes.Create(Node::Type::kValue); + v->SetName(var.name()); + v->SetPbDesc(const_cast(static_cast(&var))); + v->SetPbMsg(var.SerializeAsString()); + var2id[var.name()] = v->id(); + } + + // The variables in a SSA can only write once, so if a variable is written + // multiple times(quite common in our ProgramDesc design), multiple alias + // Nodes of this variable will be created, and each will just write once. + + // An set that keep all the names of the variables(the original, not alias) + // that have been written(as outputs). Once an Op's output variable hit the + // set, it should create a new alias and update the global alias for this + // variable. And that make a Data Flow Graph a SSA. + std::unordered_set unique_written_vars; + for (int i = 0; i < main_block.ops_size(); i++) { + const auto &op = main_block.ops(i); + auto *o = nodes.Create(Node::Type::kFunction); + o->SetName(op.type()); + static_cast(o)->SetFuncType(op.type()); + // Link to the original protobuf message's memory, make it easier to + // generate from a data flow graph to fluid ProgramDesc. + o->SetPbDesc(const_cast(static_cast(&op))); + o->SetPbMsg(op.SerializeAsString()); + + // set inputs and outputs + for (int j = 0; j < op.inputs_size(); j++) { + auto &in_var = op.inputs(j); + for (int k = 0; k < in_var.arguments_size(); k++) { + auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k))); + in->outlinks.push_back(o); + o->inlinks.push_back(in); + unique_written_vars.insert(in); + } + } + for (int j = 0; j < op.outputs_size(); j++) { + auto &out_var = op.outputs(j); + for (int k = 0; k < out_var.arguments_size(); k++) { + auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]); + if (unique_written_vars.count(out)) { + // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). 
+ auto *out_alias = nodes.Create(Node::Type::kValue); + out_alias->SetName(out->name()); + out_alias->SetPbDesc(out->pb_desc()); + out_alias->SetPbMsg(out->pb_msg()); + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node + LOG(INFO) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; + out = out_alias; + } + out->inlinks.push_back(o); + o->outlinks.push_back(out); + } + } + } + // Analysis and extract the inputs and outputs of this graph. + Build(); +} + +void DataFlowGraph::Build(const framework::ir::Graph &graph) { + // Create nodes + std::unordered_map ir_node_map; + for (auto *ir_node : graph.Nodes()) { + Node *x{nullptr}; + if (ir_node->IsOp()) { + PADDLE_ENFORCE(ir_node->Op()); + VLOG(4) << "get op " << ir_node << " " << ir_node->Name(); + x = nodes.Create(Node::Type::kFunction); + x->attr("ir_node").Pointer() = ir_node; + PADDLE_ENFORCE(ir_node->Op()->Proto()); + x->SetName(ir_node->Op()->Proto()->type()); + x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString()); + } else if (ir_node->IsVar()) { + // Not create a Node for IR ControlDepVar, considering Inference currently + // just used in single thread scenerio. + VLOG(4) << "get var " << ir_node->Name(); + x = nodes.Create(Node::Type::kValue); + x->attr("ir_node").Pointer() = ir_node; + x->SetName(ir_node->Name()); + // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString()); + } else { + PADDLE_THROW("Failed to create an Node from IR, unknown type"); + } + ir_node_map.emplace(ir_node, x); + } + VLOG(4) << "finish creating Nodes"; + + VLOG(4) << "to create edge"; + // Create links + for (auto *ir_node : graph.Nodes()) { + auto it = ir_node_map.find(ir_node); + // Skip ControlDepVar. + if (it == ir_node_map.end()) continue; + auto *node = it->second; + for (auto *x : ir_node->inputs) { + if (!ir_node_map.count(x)) continue; + node->inlinks.push_back(ir_node_map.at(x)); + } + for (auto *x : ir_node->outputs) { + if (!ir_node_map.count(x)) continue; + node->outlinks.push_back(ir_node_map.at(x)); + } + } + + Build(); + PADDLE_ENFORCE(!inputs_.empty(), + "Can't deduce any inputs from the graph, Is the graph empty?"); + + ir_graph = &graph; + VLOG(3) << "finished build from IR"; +} + void DataFlowGraph::Clean() { for (auto &node : nodes.nodes()) { std::unordered_set inlinks_set(node->inlinks.begin(), @@ -61,11 +185,9 @@ void DataFlowGraph::Clean() { std::unordered_set outlinks_set(node->outlinks.begin(), node->outlinks.end()); if (inlinks_set.size() < node->inlinks.size()) { - LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); } if (outlinks_set.size() < node->outlinks.size()) { - LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); } } @@ -90,6 +212,20 @@ std::string DataFlowGraph::DotString() const { return dot.Build(); } +std::string DataFlowGraph::HumanReadableInfo(bool show_values, + bool show_functions) const { + std::stringstream values, functions; + for (auto &n : nodes.nodes()) { + if (show_values && n->IsValue()) { + values << n->repr() << "\n"; + } + if (show_functions && n->IsFunction()) { + functions << n->repr() << "\n"; + } + } + return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str(); +} + // // NodesBFSIterator // @@ -98,10 +234,10 @@ GraphTraits::NodesBFSIterator::NodesBFSIterator( const std::vector &source) : queue_(source.begin(), 
source.end()) {} -// GraphTraits::NodesBFSIterator::NodesBFSIterator( -// GraphTraits::NodesBFSIterator &&other) noexcept -// : queue_(std::move(other.queue_)), -// visited_(std::move(other.visited_)) {} +GraphTraits::NodesBFSIterator::NodesBFSIterator( + GraphTraits::NodesBFSIterator &&other) noexcept + : queue_(std::move(other.queue_)), + visited_(std::move(other.visited_)) {} GraphTraits::NodesBFSIterator::NodesBFSIterator( const GraphTraits::NodesBFSIterator &other) @@ -145,8 +281,8 @@ bool GraphTraits::NodesBFSIterator::operator==( if (queue_.empty()) return other.queue_.empty(); if ((!queue_.empty()) && (!other.queue_.empty())) { return queue_.front() == other.queue_.front() && - visited_.size() == other.visited_.size(); // here need to check the - // equality of queue and + visited_.size() == other.visited_.size(); + // equality of queue and // visited. Just a light but week implementation. } return false; @@ -160,10 +296,10 @@ GraphTraits::NodesDFSIterator::NodesDFSIterator( for (auto *x : source) stack_.push(x); } -// GraphTraits::NodesDFSIterator::NodesDFSIterator( -// GraphTraits::NodesDFSIterator &&other) noexcept -// : stack_(std::move(other.stack_)), -// visited_(std::move(other.visited_)) {} +GraphTraits::NodesDFSIterator::NodesDFSIterator( + GraphTraits::NodesDFSIterator &&other) noexcept + : stack_(std::move(other.stack_)), + visited_(std::move(other.visited_)) {} GraphTraits::NodesDFSIterator::NodesDFSIterator( const GraphTraits::NodesDFSIterator &other) @@ -208,6 +344,153 @@ Node *GraphTraits::NodesDFSIterator::operator->() { return stack_.top(); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inlinks.size() == n; +} + +GraphTraits::NodesTSIterator::NodesTSIterator( + const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + + std::unordered_set visited; + std::unordered_set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + if (p->deleted()) { + visited.insert(p); + to_visit.erase(p); + continue; + } + inlink_visited.clear(); + + std::copy_if(p->inlinks.begin(), p->inlinks.end(), + std::back_inserter(inlink_visited), + [&](Node *x) { return visited.count(x); }); + + if (inlink_visited.size() == p->inlinks.size()) { + sorted_.push_back(p); + for (auto *_ : p->outlinks) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +GraphTraits::NodesTSIterator::NodesTSIterator( + const paddle::inference::analysis::GraphTraits< + DataFlowGraph>::NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &GraphTraits::NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +paddle::inference::analysis::GraphTraits::NodesTSIterator + &GraphTraits::NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +paddle::inference::analysis::GraphTraits::NodesTSIterator & +GraphTraits::NodesTSIterator::operator=( + const paddle::inference::analysis::GraphTraits< + DataFlowGraph>::NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool 
GraphTraits::NodesTSIterator::operator==( + const paddle::inference::analysis::GraphTraits< + DataFlowGraph>::NodesTSIterator &other) { + return sorted_ == other.sorted_ && cursor_ == other.cursor_; +} + +Node *GraphTraits::NodesTSIterator::operator->() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; + + for (auto &node : graph) { + for (auto *in : node->inlinks) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outlinks) { + if (!nodes.count(out) && out->type() == Node::Type::kValue) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + +// Filter the Intermediate results of the subgraph node. +void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { + std::vector op_nodes; + for (auto &node : GraphTraits(*graph).nodes_in_TS()) { + if (node.type() == Node::Type::kValue || node.deleted()) { + continue; + } + op_nodes.push_back(&node); + } + size_t op_num = op_nodes.size(); + for (size_t i = 0; i < op_num; i++) { + if (op_nodes[i]->type() == Node::Type::kFunction) continue; + std::unordered_set follow_up_input_names; + for (size_t j = i + 1; j < op_num; j++) { + for (auto *in : op_nodes[j]->inlinks) { + follow_up_input_names.insert(in->name()); + } + } + std::vector filtered_subgraph_outlinks; + for (auto *out : op_nodes[i]->outlinks) { + if (follow_up_input_names.count(out->name())) { + filtered_subgraph_outlinks.push_back(out); + } else { + out->SetDeleted(); + } + } + // The filtered_subgraph_outlinks may be empty. + op_nodes[i]->outlinks = filtered_subgraph_outlinks; + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index a4fefc83e0c551d52bec87299bcbc966e7a2dbf7..437e097acd24aad384df6712ce0de6106b3b5c65 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -26,6 +26,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/graph_traits.h" #include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/platform/enforce.h" @@ -36,19 +37,48 @@ namespace analysis { /* * DataFlowGraph - A container of Value and Function Nodes. + * + * This is the base graph for any other type of graphs, such as SSA or CFG. */ struct DataFlowGraph { NodeMap nodes; - std::vector inputs; - std::vector outputs; + // inputs and outputs are deduced from the graph. + // Used to interact with IR. + const framework::ir::Graph *ir_graph{nullptr}; // Extract inputs and outputs of the graph. void Build(); + void Build(const framework::proto::ProgramDesc &prog); + + // Build a graph from ir::Graph. + void Build(const framework::ir::Graph &graph); + + // Get an attribute. 
+ AnyAttr &Attr(const std::string &key) { return attrs_[key]; } + // Output a DOT graph file for debug. std::string DotString() const; + std::string HumanReadableInfo(bool show_values = true, + bool show_functions = true) const; + + const std::vector &inputs() const { + PADDLE_ENFORCE(!inputs_.empty(), + "No inputs are deduced, need to Build() first."); + return inputs_; + } + const std::vector &outputs() const { + PADDLE_ENFORCE(!outputs_.empty(), + "No outputs are deduced, need to Build() first."); + return outputs_; + } + private: + mutable std::vector inputs_; + mutable std::vector outputs_; + std::unordered_map attrs_; + // Remove duplicate edges and so on. void Clean(); }; @@ -65,7 +95,7 @@ struct GraphTraits { : public std::iterator { NodesBFSIterator() = default; explicit NodesBFSIterator(const std::vector &source); - // NodesBFSIterator(NodesBFSIterator &&other) noexcept; + NodesBFSIterator(NodesBFSIterator &&other) noexcept; // NOTE Heavy to use. NodesBFSIterator(const NodesBFSIterator &other); @@ -88,8 +118,8 @@ struct GraphTraits { struct NodesDFSIterator : public std::iterator { NodesDFSIterator() = default; - explicit NodesDFSIterator(const std::vector &source); - // NodesDFSIterator(NodesDFSIterator &&other) noexcept; + NodesDFSIterator(const std::vector &source); + NodesDFSIterator(NodesDFSIterator &&other) noexcept; NodesDFSIterator(const NodesDFSIterator &other); Node &operator*(); @@ -107,7 +137,33 @@ struct GraphTraits { std::unordered_set visited_; }; - explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} + // Topological sorting iterator on nodes. + struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::vector sorted_; + size_t cursor_{0}; + }; + + explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {} // default use BFS to visit the nodes. iterator_range nodes() { @@ -119,55 +175,35 @@ struct GraphTraits { iterator_range nodes_in_DFS() { return iterator_range(nodes_dfs_begin(), nodes_dfs_end()); } + iterator_range nodes_in_TS() { + return iterator_range(nodes_ts_begin(), nodes_ts_end()); + } private: NodesBFSIterator nodes_bfs_begin() { - return NodesBFSIterator(graph_->inputs); + return NodesBFSIterator(graph_.inputs()); } NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } + NodesDFSIterator nodes_dfs_begin() { - return NodesDFSIterator(graph_->inputs); + return NodesDFSIterator(graph_.inputs()); } NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } + NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); } + NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } + private: - DataFlowGraph *graph_; + const DataFlowGraph &graph_; }; // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. 
-static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT - std::unordered_set nodes(graph.begin(), graph.end()); - std::unordered_set inputs; - std::unordered_set outputs; - // Input a Value, check whether its inlink is in the subgraph. - auto inlink_in_subgraph = [&](Node *n) { - for (auto *in : n->inlinks) { - if (nodes.count(in)) return true; - } - return false; - }; - for (auto &node : graph) { - for (auto *in : node->inlinks) { - // The Value that is written by nodes inside a sub-graph shouldn't be the - // input of the sub-graph. - if (!nodes.count(in) && in->type() == Node::Type::kValue && - !inlink_in_subgraph(in)) { - inputs.insert(in); - } - } - for (auto *out : node->outlinks) { - if (!nodes.count(out) && out->type() == Node::Type::kValue) { - outputs.insert(out); - } - } - } - return std::make_pair(std::vector(inputs.begin(), inputs.end()), - std::vector(outputs.begin(), outputs.end())); -} +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT +void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 9d7cceeb65888b8ba3fdf39e88fc2877abd82d11..1682011c3d8cc9927a4b026b370671798cace625 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/ut_helper.h" namespace paddle { @@ -20,43 +21,145 @@ namespace inference { namespace analysis { TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); - for (auto* in : dfg.inputs) { + for (auto* in : dfg.inputs()) { LOG(INFO) << "inputs: " << in->name() << " " << static_cast(in->type()); } - for (auto* out : dfg.outputs) { + for (auto* out : dfg.outputs()) { LOG(INFO) << "outputs: " << out->name() << " " << static_cast(out->type()); } - GraphTraits trait(&dfg); - auto nodes = trait.nodes(); size_t count = 0; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - LOG(INFO) << "visiting " << it->name(); + for (auto& node : GraphTraits(dfg).nodes()) { + LOG(INFO) << "visiting " << node.name(); ++count; } ASSERT_EQ(count, dfg.nodes.size()); } TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(); - auto dfg = ProgramDescToDFG(desc); - dfg.Build(); - GraphTraits trait(&dfg); - auto nodes = trait.nodes_in_DFS(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); + DataFlowGraph dfg; + dfg.Build(desc); size_t count = 0; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - LOG(INFO) << "visiting " << it->name(); + for (auto& node : GraphTraits(dfg).nodes_in_DFS()) { + LOG(INFO) << "visiting " << node.name(); ++count; } ASSERT_EQ(count, dfg.nodes.size()); } +// Topological sorting. 
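NodesTSIterator above produces its order by repeatedly emitting nodes whose in-links have all been visited, and the TS test that follows exercises exactly that rule on a small eight-node graph. A standalone sketch of the same visiting rule on plain integer ids (the function name and printed order are illustrative; a DAG is assumed):

#include <algorithm>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Emit a node once every predecessor has been emitted -- the rule
// NodesTSIterator applies to Node::inlinks during construction.
std::vector<int> TopologicalOrder(int n,
                                  const std::vector<std::pair<int, int>> &edges) {
  std::vector<std::vector<int>> preds(n), succs(n);
  for (const auto &e : edges) {
    preds[e.second].push_back(e.first);
    succs[e.first].push_back(e.second);
  }
  std::vector<int> sorted;
  std::set<int> visited, to_visit;
  for (int i = 0; i < n; ++i)
    if (preds[i].empty()) to_visit.insert(i);  // start from the graph inputs
  while (!to_visit.empty()) {
    const std::vector<int> queue(to_visit.begin(), to_visit.end());
    for (int p : queue) {
      const bool ready =
          std::all_of(preds[p].begin(), preds[p].end(),
                      [&](int x) { return visited.count(x) > 0; });
      if (!ready) continue;  // some input of p is not emitted yet, retry later
      sorted.push_back(p);
      visited.insert(p);
      to_visit.erase(p);
      for (int s : succs[p])
        if (!visited.count(s)) to_visit.insert(s);
    }
  }
  return sorted;
}

int main() {
  // Same topology as TEST(DataFlowGraph, TS) below; inputs are 0, 1 and 2.
  const std::vector<std::pair<int, int>> edges = {{0, 4}, {0, 5}, {1, 6}, {2, 7},
                                                  {4, 5}, {4, 7}, {4, 3}, {7, 3}};
  for (int id : TopologicalOrder(8, edges)) std::cout << id << " ";
  std::cout << "\n";  // prints "0 1 2 4 5 6 7 3" -- one valid order
  return 0;
}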
+/* + * Graph topology + * inputs: 0, 1, 2 + * 0 -> 4 + * 0 -> 5 + * 1 -> 6 + * 2 -> 7 + * 4 -> 5 + * 4 -> 7 + * 4 -> 3 + * 7 -> 3 + */ +TEST(DataFlowGraph, TS) { + DataFlowGraph graph; + + for (int i = 0; i < 8; i++) { + auto* node = graph.nodes.Create(Node::Type::kValue); + node->SetName("node-" + std::to_string(i)); + } + + auto add_link = [&](int i, int j) { + Node* source = graph.nodes.GetMutable(i); + Node* target = graph.nodes.GetMutable(j); + target->inlinks.push_back(source); + source->outlinks.push_back(target); + }; + + add_link(0, 4); + add_link(0, 5); + add_link(1, 6); + add_link(2, 7); + add_link(4, 5); + add_link(4, 7); + add_link(4, 3); + add_link(7, 3); + graph.Build(); + + auto its = GraphTraits(graph).nodes_in_TS(); + std::vector sorted_ids; + for (auto it = its.begin(); it != its.end(); ++it) { + LOG(INFO) << it->name(); + sorted_ids.push_back(it->id()); + } + + // Assert a occurs prior to b in the sorted_ids. + auto assert_positive_sequence_pair = [&](int a, int b) { + auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a); + auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b); + ASSERT_LT(a_offset, b_offset); + }; + + assert_positive_sequence_pair(2, 7); + assert_positive_sequence_pair(7, 3); + assert_positive_sequence_pair(4, 3); + assert_positive_sequence_pair(0, 4); + assert_positive_sequence_pair(0, 5); + assert_positive_sequence_pair(1, 6); + assert_positive_sequence_pair(4, 5); + assert_positive_sequence_pair(4, 7); +} + +TEST(DataFlowGraph, Build_ProgramDesc) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); + DataFlowGraph graph; + graph.Build(desc); + ASSERT_EQ(graph.nodes.size(), 38UL); +} + +void SetOp(framework::ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("Xs", inputs); + op->SetOutput("Xs", outputs); +} + +TEST(DataFlowGraph, Build_IR_Graph) { + framework::ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(framework::proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + DataFlowGraph graph; + + framework::ir::Graph ir_graph(prog); + + graph.Build(ir_graph); + + ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 29ca008123addf07959b965a4b54bf55b18c401d..5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -15,24 +15,26 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include 
"paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/io.h" namespace paddle { namespace inference { + namespace analysis { using framework::proto::ProgramDesc; std::vector ExtractParameters( - const std::vector>& nodes); + const std::vector> &nodes); -bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { +bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) - PADDLE_ENFORCE(!argument->transformed_program_desc); // The transformed_program_desc should inherit all the VarDesc and BlockDesc // from the original program desc. The operators of the main block(the first // block) should rewritten by data flow graph. @@ -47,76 +49,172 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } -void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { - auto traits = GraphTraits(graph); - for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { - if (it->deleted()) continue; +void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { + // FilterRedundantOutputOfSubGraph(graph); + for (auto &node : GraphTraits(*graph).nodes_in_TS()) { + if (node.deleted()) continue; - switch (it->type()) { + switch (node.type()) { case Node::Type::kFunction: { - LOG(INFO) << "add function " << it->repr(); - AddFluidOp(&(*it)); + AddFluidOp(&node); } break; case Node::Type::kFunctionBlock: { - LOG(INFO) << "add engine op " << it->repr() << " , " - << static_cast(&(*it))->subgraph.size(); - AddEngineOp(&(*it)); + AddEngineOp(&node); } break; default: continue; } } + + if (argument_->Has(framework::ir::kParamScopeAttr)) { + LOG(WARNING) << "parameter changes in the scope takes effect"; + } + + PADDLE_ENFORCE(argument_->transformed_program_desc.get()); } -void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { - auto* ori_op = static_cast(node->pb_desc()); +void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { + PADDLE_ENFORCE(node); + PADDLE_ENFORCE(node->IsFunction()); + PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(), + "node has invalid protobuf repr."); + // currently only the main block is analyzed. - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); - *op = *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. - // The inputs and outputs of the existing ops are not changed by tensorrt - // subgraph pass. - // NOTE It might be changed by other passes in the long run. + PADDLE_ENFORCE(desc_); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); + + if (node->pb_desc()) { + auto *ori_op = static_cast(node->pb_desc()); + *op = + *ori_op; // copy the attributes, by default, these will not be changed + // by analysis phrase. + // The inputs and outputs of the existing ops are not changed by tensorrt + // subgraph pass. + // NOTE It might be changed by other passes in the long run. 
+ } else { + op->ParseFromString(node->pb_msg()); + } } -void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, - const framework::proto::BlockDesc& block) { +void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, + framework::proto::BlockDesc *block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; - auto* func = static_cast(node); + auto *func = static_cast(node); // collect inputs - std::vector io; - for (auto* x : func->inlinks) { - io.push_back(x->name()); + std::unordered_set input_names; + std::unordered_set input_names_with_id; + for (auto *x : func->inlinks) { + input_names.insert(x->name()); + input_names_with_id.insert(x->name() + std::to_string(x->id())); } - desc.SetInput("Xs", io); + desc.SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); - // collect outputs - io.clear(); - for (auto* x : func->outlinks) { - io.push_back(x->name()); + std::unordered_set output_names; + std::unordered_set output_names_with_id; + for (auto *x : func->outlinks) { + output_names.insert(x->name()); + output_names_with_id.insert(x->name() + std::to_string(x->id())); } - desc.SetOutput("Ys", io); + desc.SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); desc.SetType("tensorrt_engine"); + + std::unordered_map output_name_map; + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + // Why we do this? + // During the transition from fluid OP to tensorrt OP, we map + // the input and output Tensor(fluid data structure) of fluid OP + // to the correspondin ITensor (trt data structure) through the + // Tensor name. When we set up ITensor for an variable, we must + // ensure that it has not been set before. + // If there is variable in the fluid graph, which is not only the + // input of a OP, but also the output of a Op, there will be problems. + // So we have to rename the variable in the subgraph to make sure + // it is either an OP's input or an OP's output. 
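The comment block above ends the rationale for the renaming that follows: every variable reference inside the subgraph gets its node id appended unless it is an input of the whole subgraph, and renamed subgraph outputs are recorded in output_name_map so results can be copied back under their original names. A condensed sketch of just that naming rule, folding the two loops below into one helper (the variable names and ids here are made up):

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Condensed view of the renaming applied below: append the node id unless the
// name is a subgraph input; remember the new name of every subgraph output.
std::string RenameInsideSubgraph(
    const std::string &name, int node_id,
    const std::unordered_set<std::string> &input_names_with_id,
    const std::unordered_set<std::string> &output_names_with_id,
    std::unordered_map<std::string, std::string> *output_name_map) {
  const std::string name_with_id = name + std::to_string(node_id);
  if (input_names_with_id.count(name_with_id)) {
    return name;  // subgraph inputs keep their original fluid name
  }
  if (output_names_with_id.count(name_with_id)) {
    (*output_name_map)[name] = name_with_id;  // used for the copy-back step
  }
  return name_with_id;
}

int main() {
  const std::unordered_set<std::string> inputs{"x0"};     // "x" from node 0
  const std::unordered_set<std::string> outputs{"out7"};  // "out" from node 7
  std::unordered_map<std::string, std::string> output_name_map;

  std::cout << RenameInsideSubgraph("x", 0, inputs, outputs, &output_name_map)
            << "\n";  // x     -- subgraph input, kept as-is
  std::cout << RenameInsideSubgraph("tmp", 3, inputs, outputs, &output_name_map)
            << "\n";  // tmp3  -- intermediate value, id appended
  std::cout << RenameInsideSubgraph("out", 7, inputs, outputs, &output_name_map)
            << "\n";  // out7  -- subgraph output, id appended and remembered
  std::cout << output_name_map.at("out") << "\n";  // out7
  return 0;
}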
+ + auto subgraph_nodes = func->subgraph; + for (int index = 0; index < block->ops_size(); index++) { + framework::proto::OpDesc *op = block->mutable_ops(index); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->name(), op->type()); + + std::unordered_map var2id; + for (auto *in_var : correspond_node->inlinks) { + var2id[in_var->name()] = in_var->id(); + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outlinks) { + var2id[out_var->name()] = out_var->id(); + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } + // When tensorrt engine runs at the end of the operation, + // output_mapping help us copy the data from the renamed ITensor + // to Tensor. + std::vector output_mapping; + for (auto name : output_names) { + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + + PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); // Set attrs - SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); - SetAttr(desc.Proto(), "engine_unique_key", - "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter - SetAttr(desc.Proto(), "max_workspace", - 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); + SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); + SetAttr(desc.Proto(), "output_name_mapping", output_mapping); node->SetPbMsg(desc.Proto()->SerializeAsString()); } std::vector ExtractParameters( - const std::vector>& nodes) { + const std::vector> &nodes) { std::vector parameters; - for (const auto& node : nodes) { + for (const auto &node : nodes) { if (!node->IsValue()) continue; PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); framework::proto::VarDesc var; @@ -128,21 +226,31 @@ std::vector ExtractParameters( return parameters; } -void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { +void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { // TODO(Superjomn) Here need to expose some arguments for default setting. 
PADDLE_ENFORCE(node->IsFunctionBlock()); - auto* block_node = static_cast(node); + auto *block_node = static_cast(node); framework::proto::BlockDesc proto; framework::BlockDesc block_desc(nullptr, &proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + VLOG(4) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size(); // copy ops. - for (auto* node : block_node->subgraph) { - auto* op = block_desc.AppendOp(); + + for (auto *node : block_node->subgraph) { + auto *op = block_desc.AppendOp(); PADDLE_ENFORCE(!node->pb_msg().empty()); op->Proto()->ParseFromString(node->pb_msg()); } - CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + + *block_desc.Proto()->mutable_vars() = + argument_->origin_program_desc->blocks(0).vars(); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); + CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); op->ParseFromString(node->pb_msg()); } @@ -151,7 +259,7 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config& config) + explicit DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } @@ -160,9 +268,9 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { }; } // namespace -Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { +AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_inference_analysis_graphviz_log_root, + FLAGS_IA_graphviz_log_root, "data_flow_graph_to_fluid_graphviz_debugger")); } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index edc84b02ed20991e3e7c6c437d2b1fac169bae03..891c7226e245fa3b92892785362c186185a61f62 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -21,11 +21,12 @@ #include #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/pass.h" namespace paddle { namespace inference { + namespace analysis { class DataFlowGraphToFluidPass final : public DataFlowGraphPass { public: @@ -41,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { return "Transform a DFG to a Fluid ProgramDesc"; } - Pass *CreateGraphvizDebugerPass() const override; + AnalysisPass *CreateGraphvizDebugerPass() const override; protected: // Add a Fluid Op into the ProgramDesc. 
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -26,21 +26,21 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Test) { - DataFlowGraph graph; +TEST(DataFlowGraph, Test) { + Argument argument(FLAGS_inference_model_dir); FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; ASSERT_TRUE(pass0.Initialize(&argument)); ASSERT_TRUE(pass1.Initialize(&argument)); - pass0.Run(&graph); - pass1.Run(&graph); + pass0.Run(argument.main_dfg.get()); + pass1.Run(argument.main_dfg.get()); pass0.Finalize(); pass1.Finalize(); - LOG(INFO) << graph.nodes.size(); + LOG(INFO) << argument.main_dfg->nodes.size(); } }; // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index a6f85484756417e103cbb60bcb664e8b800b9f28..648b8f7d6a6ec4bafbad2838c5631e776c8699b1 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; std::string message; - LOG(INFO) << "draw to " << png_path; + VLOG(3) << "draw to " << png_path; ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); } @@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { for (size_t i = 0; i < graph->nodes.size(); i++) { const Node &node = graph->nodes.Get(i); if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &in : node.inlinks) { - if (!config_.display_deleted_node && in->deleted()) continue; - dot.AddEdge(in->repr(), node.repr(), {}); + for (auto &out : node.outlinks) { + if (!config_.display_deleted_node && out->deleted()) continue; + dot.AddEdge(node.repr(), out->repr(), {}); } } return dot.Build(); diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index 17445ab4407a159ca11345bc9a9226b3ad0044f0..e537bfc0e64d4ff46b3d61499a1a0298ed83533f 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/dot.h" -#include "paddle/fluid/inference/analysis/pass.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 162455b9c4e06b7fbb4bdede30444faf6a8a1509..928be7917047382d9b86294f6039b26b0ebf6f49 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -23,12 +23,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(*argument.origin_program_desc); +TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { + Argument argument(FLAGS_inference_model_dir); + FluidToDataFlowGraphPass pass0; + ASSERT_TRUE(pass0.Initialize(&argument)); + pass0.Run(argument.main_dfg.get()); + + // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); DFG_GraphvizDrawPass pass(config); pass.Initialize(&argument); - pass.Run(&dfg); + pass.Run(argument.main_dfg.get()); // test content std::ifstream file("./0-graph_test.dot"); @@ -40,7 +46,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { no++; } // DFG is sensitive to ProgramDesc, be careful to change the existing models. - ASSERT_EQ(no, 82); + ASSERT_EQ(no, 83); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 4bf1840fdda8508b52d7274a338c5b1c95baf354..4693729cb43d7a9df96b11c4bf3064a70d1db4c3 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -29,13 +29,13 @@ namespace paddle { namespace inference { namespace analysis { +static size_t dot_node_counter{0}; + /* * A Dot template that helps to build a DOT graph definition. */ class Dot { public: - static size_t counter; - struct Attr { std::string key; std::string value; @@ -57,7 +57,7 @@ class Dot { Node(const std::string& name, const std::vector& attrs) : name(name), attrs(attrs), - id_("node_" + std::to_string(Dot::counter++)) {} + id_("node_" + std::to_string(dot_node_counter++)) {} std::string id() const { return id_; } @@ -65,6 +65,10 @@ class Dot { std::stringstream ss; CHECK(!name.empty()); ss << id_; + if (attrs.empty()) { + ss << "[label=" << '"' << name << '"' << "]"; + return ss.str(); + } for (size_t i = 0; i < attrs.size(); i++) { if (i == 0) { ss << "[label=" << '"' << name << '"' << " "; @@ -108,9 +112,11 @@ class Dot { explicit Dot(const std::vector& attrs) : attrs_(attrs) {} - void AddNode(const std::string& name, const std::vector& attrs) { - CHECK(!nodes_.count(name)) << "duplicate Node '" << name << "'"; - nodes_.emplace(name, Node{name, attrs}); + void AddNode(const std::string& id, const std::vector& attrs, + std::string label = "") { + CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'"; + if (label.empty()) label = id; + nodes_.emplace(id, Node{label, attrs}); } void AddEdge(const std::string& source, const std::string& target, diff --git a/paddle/fluid/inference/analysis/flags.h b/paddle/fluid/inference/analysis/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..717e543f01dfa071865a5c14c0b7679e65239daf --- /dev/null +++ b/paddle/fluid/inference/analysis/flags.h @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this +// flag if not available. +DECLARE_bool(IA_enable_tensorrt_subgraph_engine); +DECLARE_string(IA_graphviz_log_root); +DECLARE_string(IA_output_storage_path); +DECLARE_bool(IA_enable_ir); diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index e918622d74cfb11d83090555be2a768cc14e7742..2b7d632c839e735ca03c6e17b94307b40cc13374 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -25,10 +26,21 @@ namespace analysis { bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); - PADDLE_ENFORCE(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + if (!argument->main_dfg) { - LOG(INFO) << "Init DFG"; argument->main_dfg.reset(new DataFlowGraph); } desc_ = argument->origin_program_desc.get(); @@ -40,48 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); - // insert vars - std::unordered_map var2id; - auto &main_block = desc_->blocks(framework::kRootBlockIndex); - for (int i = 0; i < main_block.vars_size(); i++) { - const auto &var = main_block.vars(i); - auto *v = graph->nodes.Create(Node::Type::kValue); - v->SetName(var.name()); - v->SetPbDesc(const_cast(static_cast(&var))); - v->SetPbMsg(var.SerializeAsString()); - var2id[var.name()] = v->id(); - } - for (int i = 0; i < main_block.ops_size(); i++) { - const auto &op = main_block.ops(i); - auto *o = graph->nodes.Create(Node::Type::kFunction); - o->SetName(op.type()); - static_cast(o)->SetFuncType(op.type()); - // Link to the original protobuf message's memory, make it easier to - // generate from a data flow graph to fluid ProgramDesc. 
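(Editor's sketch of the path resolution added to FluidToDataFlowGraphPass::Initialize above; the directory name is a placeholder.)
    Argument argument;
    argument.fluid_model_dir.reset(new std::string("/models/word2vec"));  // placeholder directory
    FluidToDataFlowGraphPass pass;
    pass.Initialize(&argument);
    // argument.fluid_model_program_path now holds "/models/word2vec/__model__",
    // and argument.main_dfg has been created if it was not already set.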
- o->SetPbDesc(const_cast(static_cast(&op))); - o->SetPbMsg(op.SerializeAsString()); - - // set inputs and outputs - // TODO(Superjomn) make sure the InputNames is the real variable name. - for (int j = 0; j < op.inputs_size(); j++) { - auto &in_var = op.inputs(j); - for (int k = 0; k < in_var.arguments_size(); k++) { - auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); - in->outlinks.push_back(o); - o->inlinks.push_back(in); - } - } - for (int j = 0; j < op.outputs_size(); j++) { - auto &out_var = op.outputs(j); - for (int k = 0; k < out_var.arguments_size(); k++) { - auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - out->inlinks.push_back(o); - o->outlinks.push_back(out); - } - } - } - // Analysis and extract the inputs and outputs of this graph. - graph->Build(); + graph->Build(*desc_); } namespace { @@ -95,9 +66,9 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { }; } -Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { +AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); + FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index da8463b63bd0bb1633bfcb9d7d41a884ddd632c7..b9e262020e9522e167b998d57e2be2ac19b48447 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -22,15 +22,15 @@ #include #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/pass.h" namespace paddle { namespace inference { namespace analysis { /* - * Transform a FluidDesc to a data flow graph. + * Transform a FluidDesc to a SSA. */ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: @@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { return "transform a fluid ProgramDesc to a data flow graph."; } - Pass *CreateGraphvizDebugerPass() const override; + AnalysisPass *CreateGraphvizDebugerPass() const override; private: framework::proto::ProgramDesc const *desc_; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index cfbbc284e491bd62a6108d6d14e7896a57d1b63e..267a0a84ebf75615e0b390f4a1b3bf3b51793fc7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -21,15 +21,16 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Init) { +TEST(FluidToDataFlowGraphPass, Test) { FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); pass.Initialize(&argument); - DataFlowGraph graph; - pass.Run(&graph); + pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. 
- ASSERT_EQ(graph.nodes.size(), 37); + ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); pass.Finalize(); - LOG(INFO) << '\n' << graph.DotString(); + ASSERT_FALSE(argument.main_dfg->DotString().empty()); + EXPECT_FALSE(argument.main_dfg->inputs().empty()); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc60ca3bd0bf706407defb2655a093d999aef7c2 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void FluidToIrPass::EnableParamModify(const std::string &model_dir, + const std::string &prog_file, + const std::string ¶m_file) { + PADDLE_ENFORCE(argument_); + argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope); + // Load parameters. + VLOG(3) << "Loading parameters from " << model_dir; + LoadParams(&argument_->Get(framework::ir::kParamScopeAttr), + model_dir, prog_file, param_file); +} + +bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir, + const std::string &prog_file, + const std::string ¶m_file) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + framework::Executor executor(place); + PADDLE_ENFORCE(argument_->origin_program_desc.get()); + framework::ProgramDesc program(*argument_->origin_program_desc); + if ((!prog_file.empty()) && (!param_file.empty())) { + LOG(INFO) << "load single model file from " << prog_file; + Load(&executor, scope, prog_file, param_file); + } else if (!dir.empty()) { + LOG(INFO) << "load from dir " << dir; + Load(&executor, scope, dir); + } else { + LOG(ERROR) << "failed to load parameters"; + return false; + } + return true; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c2599e218a2306f9353b843b7ea3f18aeacb008e --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/flags.h" +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; + +class FluidToIrPass final : public DataFlowGraphPass { + public: + FluidToIrPass() = default; + + bool Initialize(Argument *argument) override { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr), + "argument need the attr %s", kFluidToIrPassesAttr); + argument_ = argument; + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + // set fluid model program path + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + // Load program. + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + // Create main data flow graph. + if (!argument->main_dfg) { + argument->main_dfg.reset(new DataFlowGraph); + } + argument->Set("ir_program_desc", new ProgramDesc(program)); + + LOG(INFO) << "Loading parameters"; + // Load parameters to argument if needed. + if (argument->fluid_model_dir || (argument->fluid_model_program_path && + argument->fluid_model_param_path)) { +#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : ""; + SAFE_GET(fluid_model_dir); + SAFE_GET(fluid_model_program_path); + SAFE_GET(fluid_model_param_path); +#undef SAFE_GET + EnableParamModify(fluid_model_dir, fluid_model_program_path, + fluid_model_param_path); + } + + return true; + } + + bool Finalize() override { return true; } + + void Run(DataFlowGraph *graph) override { + // Call all the IR Passes + IRPassManager ir_passes(argument_->Get("ir_program_desc"), + nullptr); + // Pass the scope from analysis to IR if needed. + if (argument_->Has(framework::ir::kParamScopeAttr)) { + // Here the address is passed, attention that IR doesn't own the scope, so + // the real scope in analysis should live during the IR phase. + ir_passes.graph().Set( + framework::ir::kParamScopeAttr, + new framework::Scope *(&argument_->Get( + framework::ir::kParamScopeAttr))); + } + + if (FLAGS_IA_enable_ir) { + const auto &ir_passes_to_apply = + argument_->Get>(kFluidToIrPassesAttr); + ir_passes.Apply(ir_passes_to_apply); + } + + PADDLE_ENFORCE(argument_->main_dfg.get()); + argument_->main_dfg->Build(ir_passes.graph()); + // inherit the arguments from ir. 
+ if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) { + argument_->Set( + framework::ir::kFuseStatisAttr, + new std::unordered_map( + ir_passes.graph().Get>( + framework::ir::kFuseStatisAttr))); + } + } + + void EnableParamModify(const std::string &model_dir, + const std::string &prog_file, + const std::string ¶m_file); + + std::string repr() const override { return "fluid-to-ir-pass"; } + + private: + // Load parameters from a single file or from a directory. + bool LoadParams(framework::Scope *scope, const std::string &dir, + const std::string &prog_file, const std::string ¶m_file); + + private: + Argument *argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..367c25805d05f8d10fb8341158760ac6356a5c48 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" + +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(FluidToIrPass, Test) { + FluidToIrPass pass; + Argument argument(FLAGS_inference_model_dir); + argument.Set(kFluidToIrPassesAttr, + new std::vector({"infer_clean_graph_pass"})); + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f1064cd20f28092d80d3fd23a862da080b6cc2f3..5151e2b69ac199dea136535ba445e890596f6227 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -14,7 +14,9 @@ limitations under the License. 
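(Editor's sketch of driving FluidToIrPass with a combined program/parameter pair, mirroring the tester above; the file paths and the pass list are placeholders.)
    Argument argument;
    argument.fluid_model_program_path.reset(new std::string("/models/m/__model__"));  // placeholder
    argument.fluid_model_param_path.reset(new std::string("/models/m/param"));        // placeholder
    argument.Set(kFluidToIrPassesAttr,
                 new std::vector<std::string>({"infer_clean_graph_pass"}));
    FluidToIrPass pass;
    pass.Initialize(&argument);         // loads the program and, since both paths are set, the parameters
    pass.Run(argument.main_dfg.get());  // applies the IR passes and rebuilds the main DFG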
*/ #pragma once +#include #include +#include #include #include #include @@ -136,6 +138,37 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) { } } +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + +static bool FileExists(const std::string &filepath) { + std::ifstream file(filepath); + bool exists = file.is_open(); + file.close(); + return exists; +} + +static bool PathExists(const std::string &path) { + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } + return false; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..e76708baf4b39afb0febbcf3ff71281dfbfc8627 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + +IRPassManager::IRPassManager(const ProgramDesc &program, + framework::Scope *scope) + : program_(program) { + graph_.reset(new framework::ir::Graph(program)); + if (scope) + graph_->Set(framework::ir::kParamScopeAttr, new framework::Scope *(scope)); +} + +void IRPassManager::Apply(const std::vector &passes) { + // Apply all the passes + std::string pre_pass; + int pass_num = 0; + for (const std::string &pass_name : passes) { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); + auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + if (pass_name == "graph_viz_pass") { + std::string dot_file_path = std::to_string(pass_num) + "_ir_" + + (pre_pass.empty() ? 
"origin" : pre_pass) + + ".dot"; + pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + pass_num++; + } + graph_ = pass->Apply(std::move(graph_)); + pre_pass = pass_name; + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..bb230283b7c2cc783d0b68ea0aa3cca1cabc75e6 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines IRPassManager, it helps control the passes in IR. Inference + * phrase will load the model program and parameters from disk, that is quite + * different from the training phase. + * This manager will control the Passes and make the passes in IR work smoothly + * for inference. + */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ProgramDesc; + +class IRPassManager final { + public: + IRPassManager(const ProgramDesc &program, framework::Scope *scope); + + void Apply(const std::vector &passes); + + framework::ir::Graph &graph() const { return *graph_; } + + private: + std::unique_ptr graph_; + ProgramDesc program_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c313db08875669010ddcca13aa66b383ee6d26f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void ModelStorePass::Run(DataFlowGraph *x) { + if (!argument_->fluid_model_param_path) { + PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); + argument_->fluid_model_param_path.reset( + new std::string(*argument_->fluid_model_dir + "param")); + } + PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); + // Directly copy param file to destination. + std::stringstream ss; + // NOTE these commands only works on linux. + ss << "mkdir -p " << *argument_->model_output_store_path; + VLOG(3) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + ss.str(""); + + ss << "cp " << *argument_->fluid_model_dir << "/*" + << " " << *argument_->model_output_store_path; + VLOG(3) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + + // Store program + PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, + "program desc is not transformed, should call " + "DataFlowGraphToFluidPass first."); + VLOG(3) << "store analyzed program to " + << *argument_->model_output_store_path; + const std::string program_output_path = + *argument_->model_output_store_path + "/__model__"; + std::ofstream file(program_output_path, std::ios::binary); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", + program_output_path); + const std::string serialized_message = + argument_->transformed_program_desc->SerializeAsString(); + file.write(serialized_message.c_str(), serialized_message.size()); +} + +bool ModelStorePass::Finalize() { return true; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..f14b49e09c2f8e79c6fc4accdbf17f4f7a9bb1a3 --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines ModelStorePass, which store the runtime DFG to a Paddle + * model in the disk, and that model can be reloaded for prediction. 
+ */ + +#pragma once +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class ModelStorePass : public DataFlowGraphPass { + public: + bool Initialize(Argument* argument) override { + if (!argument) { + LOG(ERROR) << "invalid argument"; + return false; + } + argument_ = argument; + return true; + } + + void Run(DataFlowGraph* x) override; + + std::string repr() const override { return "DFG-store-pass"; } + std::string description() const override { + return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle + model in the disk, and that model can be reloaded for prediction again.)DD"; + } + + bool Finalize() override; + + private: + Argument* argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6493fc25edf25003504542f1b01c4105754c8df --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
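(Editor's sketch: after ModelStorePass runs, the transformed program sits next to the copied parameter files and can be reloaded with the LoadProgramDesc helper added in this patch; the output path is an example.)
    auto stored = LoadProgramDesc("./_stored_model/__model__");  // example output path
    framework::ProgramDesc reloaded(stored);                     // ready to hand to an executor/predictor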
+ +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "Model path"); + +TEST(DFG_StorePass, test) { + Analyzer analyzer; + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset( + new std::string("./_dfg_store_pass_tmp")); + // disable storage in alalyzer + FLAGS_IA_output_storage_path = ""; + analyzer.Run(&argument); + + ModelStorePass pass; + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index f2e918f3ff41d9db0c3ec38561015967bed26f4e..3339b5044df0cf91d00aa9ddad310d4bf263bc3c 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -20,17 +20,6 @@ namespace paddle { namespace inference { namespace analysis { -template <> -std::string &NodeAttr::As() { - if (data_.empty()) { - type_index_ = std::type_index(typeid(std::string)); - } - PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string))); - return data_; -} - -std::string &NodeAttr::String() { return As(); } - std::vector Value::dot_attrs() const { return std::vector({Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 47e524bc5c4a6b1324d5f182053129311487522d..af34156bc2f101465d87cb10e2155745022eb521 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { @@ -37,41 +38,36 @@ namespace analysis { class NodeMap; // A helper class to maintain the status from Pass. -struct NodeAttr { +struct AnyAttr { + using any_t = + boost::variant; // NOTE T should be a primary type or a struct combined by several primary // types. // NOTE the STL containers should not use here. // Some usages // Attr attr; // attr.Bool() = true; - bool &Bool() { return As(); } float &Float() { return As(); } int32_t &Int32() { return As(); } int64_t &Int64() { return As(); } void *&Pointer() { return As(); } - std::string &String(); + std::string &String() { return As(); } - private: template T &As() { - // init storage in the first usage. 
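(Editor's note on the new AnyAttr above: the first accessor call fixes the stored type, and any later access with a different type trips the "fetch error type" enforcement. A minimal sketch:)
    AnyAttr attr;
    attr.Int32() = 7;     // first use locks the variant to int32_t
    attr.Int32() += 1;    // fine: same type
    // attr.String();     // would now fail PADDLE_ENFORCE with "fetch error type"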
- if (data_.empty()) { - VLOG(4) << "resize data to " << sizeof(T); - type_index_ = std::type_index(typeid(T)); - data_.resize(sizeof(T)); + if (type_index_ == typeid(AnyAttr)) { + type_index_ = typeid(T); + any_data_ = T(); + } else { + PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type"); } - PADDLE_ENFORCE(framework::IsType(type_index_), - "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_index_), - DataTypeNamer::Global().repr()); - PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); - return *reinterpret_cast(&data_[0]); + return boost::get(any_data_); } private: - std::string data_; - std::type_index type_index_{typeid(NodeAttr)}; + any_t any_data_; + std::type_index type_index_{typeid(AnyAttr)}; }; /* @@ -108,7 +104,7 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - NodeAttr &attr(const std::string &name) const { return attrs_[name]; } + AnyAttr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } @@ -153,7 +149,7 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - mutable std::unordered_map attrs_; + mutable std::unordered_map attrs_; }; class Function; diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc index ea832a3a7e47758be9b6bd59a4325ddb576ec446..9207c15373fb4264ff0e738e93ae88e1c08b554c 100644 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ b/paddle/fluid/inference/analysis/node_tester.cc @@ -20,6 +20,24 @@ namespace paddle { namespace inference { namespace analysis { +TEST(NodeAttr, bool) { + AnyAttr x; + x.Bool() = true; + ASSERT_EQ(x.Bool(), true); +} + +TEST(NodeAttr, int32) { + AnyAttr x; + x.Int32() = 32; + ASSERT_EQ(x.Int32(), 32); +} + +TEST(NodeAttr, string) { + AnyAttr x; + x.String() = "Hello"; + ASSERT_EQ(x.String(), "Hello"); +} + TEST(Node, Attr) { // Node is an abstract class, use Value instead for they share the same Attr // logic. @@ -27,6 +45,9 @@ TEST(Node, Attr) { auto* node = nodes.Create(Node::Type::kValue); node->attr("v0").Int32() = 2008; ASSERT_EQ(node->attr("v0").Int32(), 2008); + + node->attr("str").String() = "hello world"; + ASSERT_EQ(node->attr("str").String(), "hello world"); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228..a6ac0ee49f8f408faa7a17bf5ef5d2799a9a6238 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -22,7 +23,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ = argument; for (auto& pass : data_) { - LOG(INFO) << "Initializing pass " << pass->repr(); + VLOG(3) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -33,24 +34,14 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); + VLOG(3) << "Total " << data_.size() << " Analysys passes"; for (auto& pass : data_) { - VLOG(4) << "Running pass [" << pass->repr() << "]"; + string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", + pass->repr()); pass->Run(argument_->main_dfg.get()); } } -void NodePassManager::RunAll() { - PADDLE_ENFORCE(argument_); - PADDLE_ENFORCE(argument_->main_dfg.get()); - auto trait = - GraphTraits(argument_->main_dfg.get()).nodes_in_DFS(); - for (auto& node : trait) { - for (auto& pass : data_) { - pass->Run(&node); - } - } -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h index 81a17e0287a5aef8a328e43380ee3691f5a32379..412747c4fcce73303703f586f7a04edf4cc5ee76 100644 --- a/paddle/fluid/inference/analysis/pass_manager.h +++ b/paddle/fluid/inference/analysis/pass_manager.h @@ -33,7 +33,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { namespace inference { @@ -43,7 +43,7 @@ namespace analysis { * PassManager is the base class for all pass managers, a pass manager has * several Pass-es registered, and execute them in the linear order. */ -class PassManager : public OrderedRegistry { +class PassManager : public OrderedRegistry { public: PassManager() = default; // Call all the passes' Initialize methods. The desc and data_flow_graph are @@ -89,18 +89,6 @@ class DfgPassManager : public PassManager { virtual ~DfgPassManager() = default; }; -/* - * A pass manager that process a Node each time. 
- */ -class NodePassManager : public PassManager { - public: - NodePassManager() = default; - - void RunAll() override; - - virtual ~NodePassManager() = default; -}; - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index dac1c509d728114bd24a2ea1150c407646026fd4..72b0fbf7e571ec97a0ea093d01449c1d5ddb9b91 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -34,29 +34,7 @@ class TestDfgPassManager final : public DfgPassManager { std::string description() const override { return "test doc"; } }; -class TestNodePassManager final : public NodePassManager { - public: - virtual ~TestNodePassManager() = default; - - std::string repr() const override { return "test-node-pass-manager"; } - std::string description() const override { return "test doc"; } -}; - -class TestNodePass final : public NodePass { - public: - virtual ~TestNodePass() = default; - - bool Initialize(Argument* argument) override { return true; } - - void Run(Node* node) override { - LOG(INFO) << "- Processing node " << node->repr(); - } - - std::string repr() const override { return "test-node"; } - std::string description() const override { return "some doc"; } -}; - -TEST_F(DFG_Tester, DFG_pass_manager) { +TEST(PassManager, DFG_pass_manager) { TestDfgPassManager manager; DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); @@ -64,19 +42,9 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); - ASSERT_TRUE(&argument); - ASSERT_TRUE(manager.Initialize(&argument)); - manager.RunAll(); -} - -TEST_F(DFG_Tester, Node_pass_manager) { - // Pre-process: initialize the DFG with the ProgramDesc first. - FluidToDataFlowGraphPass pass0; - pass0.Initialize(&argument); - pass0.Run(argument.main_dfg.get()); + Argument argument(FLAGS_inference_model_dir); - TestNodePassManager manager; - manager.Register("test-node-pass", new TestNodePass); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 389f9e1a9148a4daf0e5b751cce5cb6325252a4e..b879067d2f2f6294c50e0adb21f9399a7c36698a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) { } void SubGraphSplitter::MarkNodesInsideSubGraph() { - for (auto &node : GraphTraits(graph_).nodes()) { + for (auto &node : GraphTraits(*graph_).nodes()) { if (node_inside_subgraph_teller_(&node)) { node.attr(kMarkerAttrName).Bool() = true; if (node.type() == Node::Type::kFunction) { @@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; } +// This is a simple representation of a graph. +// The BriefNode hold the pointer of the Node. +// This is to avoid changing the original graph +// in the process of trt graph analysis. +struct BriefNode { + explicit BriefNode(Node *n) { node = n; } + Node *node; + std::vector inlinks; + std::vector outlinks; +}; + +// Union two adjacent BriefNode. +// Suppose we have two adjacent nodes src and dst. 
+// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. +void UnionContractedNodes(const std::unordered_map &node_map, + int src_id, int dst_id) { + // merge the two adjacent nodes into one node. + BriefNode *src_node = node_map.at(src_id); + BriefNode *dst_node = node_map.at(dst_id); + + std::unordered_set inputs(src_node->inlinks.begin(), + src_node->inlinks.end()); + std::unordered_set outputs; + + for (auto *n : src_node->outlinks) { + if (n != dst_node) outputs.insert(n); + } + + // Add the inlinks and outlinks of dst node to src node. + std::vector dst_in_nodes = dst_node->inlinks; + for (BriefNode *node : dst_in_nodes) { + if (node != src_node) { + inputs.insert(node); + } + } + + std::vector dst_out_nodes = dst_node->outlinks; + for (BriefNode *node : dst_out_nodes) { + outputs.insert(node); + } + +// update the dst and src node's inlinks and outlinks. +#ifdef __clang__ + src_node->inlinks = std::vector(inputs.begin(), inputs.end()); + src_node->outlinks = std::vector(outputs.begin(), outputs.end()); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#else + src_node->inlinks = + std::move(std::vector(inputs.begin(), inputs.end())); + src_node->outlinks = + std::move(std::vector(outputs.begin(), outputs.end())); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#endif + + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (n == src_node || n == dst_node) { + n = src_node; + } + } + }; + // Change all the dst inputs and outputs corresponding inlink and + // outlink to the src node. + for (auto *node : src_node->inlinks) { + inlink_or_outlink_cleaner(node->outlinks); + } + + for (auto *node : src_node->outlinks) { + inlink_or_outlink_cleaner(node->inlinks); + } +} + +// FlexibleDFS +// If reverse is true, do reverse dfs. +// If enter func is not nullptr, calls enter(node) before visiting any children +// of node. +// If leave func not nullptr, calls leave(node) after visiting all parents of +// node. +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave) { + typedef struct { + const BriefNode *node; + bool leave; + } FNode; + + std::vector stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector iter_nodes = + reverse == true ? fnode.node->inlinks : fnode.node->outlinks; + for (const BriefNode *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + std::vector> SubGraphSplitter::ExtractSubGraphs() { + // Run the Extract algorithm to find all subgraphs. std::vector marked_nodes; - for (auto &node : GraphTraits(graph_).nodes()) { + // We use brief_node_map to represent the original graph in order to avoid + // changing the original graph. 
+ std::unordered_map brief_node_map; + + for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { + brief_node_map[node.id()] = new BriefNode(&node); if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } } + // extract sub-graphs in the marked node set, use Union Find algorithm. node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { @@ -88,11 +216,73 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { n->attr(kUnionFindParent).Int32() = n->id(); node_map[n->id()] = n; } - std::unordered_set visited; - for (auto *n : marked_nodes) { - for (auto *out : n->outlinks) { - if (node_map.count(out->id())) { - UnionFindCombine(node_map, n->id(), out->id()); + + // create breif node map + for (auto &itr : brief_node_map) { + for (Node *node : itr.second->node->inlinks) { + itr.second->inlinks.push_back(brief_node_map[node->id()]); + } + + for (Node *node : itr.second->node->outlinks) { + itr.second->outlinks.push_back(brief_node_map[node->id()]); + } + } + + for (auto &itr : brief_node_map) { + BriefNode *brief_node = itr.second; + + if (!brief_node->node->attr(kMarkerAttrName).Bool()) { + VLOG(4) << brief_node->node->id() << " node not a trt candicate."; + continue; + } + + // Our algorithm must guarantee that: + // 1. The graph is always directed acyclic graph(DAG). + // 2. If there is a path in the subgraph from X to Y (X and Y are both + // nodes in the subgraph), then all paths from X to Y are in the + // subgraph. + // + // In order to achieve the above guarantee. + // For adjacent nodes src -> dst. + // 1. Get all dst input nodes except src. + // 2. Reverse DFS from those input nodes + // 3. If there is a path from input nodes to src, + // then the src and dst nodes can not be fused into one node, + // otherwise it can be done. + + while (true) { + std::unordered_set contract_nodes; + for (auto *out : brief_node->outlinks) { + // must be an trt candidate + if (!out->node->attr(kMarkerAttrName).Bool()) continue; + // get all dst input nodes except src. + std::vector source_nodes; + for (auto *n : out->inlinks) { + if (n != brief_node) { + source_nodes.push_back(n); + } + } + + // Reverse DFS from the source_nodes. + bool have_excess_path = false; + FlexibleDFS(source_nodes, true, nullptr, + [&have_excess_path, brief_node](const BriefNode *n) { + if (n == brief_node) { + have_excess_path = true; + return false; + } + return true; + }); + if (have_excess_path) continue; + contract_nodes.insert(out); + } + if (contract_nodes.empty()) break; + + for (auto dst_node : contract_nodes) { + UnionFindCombine(node_map, brief_node->node->id(), + dst_node->node->id()); + UnionContractedNodes(brief_node_map, brief_node->node->id(), + dst_node->node->id()); } } } @@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); + for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. 
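(Editor's worked example of the acyclicity check above, with hypothetical ops a, b and c: suppose a -> b -> c and also a -> c, where a and c are TensorRT candidates but b is not. The inputs of c other than a are {b}; a reverse DFS from b reaches a, so the merge of a and c is skipped, because fusing them while b stays outside would create the cycle fused(a,c) -> b -> fused(a,c).)
    // a_node, b_node are hypothetical BriefNode* used only for this example.
    bool have_excess_path = false;
    FlexibleDFS(/*source=*/{b_node}, /*reverse=*/true, /*enter=*/nullptr,
                [&](const BriefNode *n) {
                  if (n == a_node) {      // reached the merge source: an outside path exists
                    have_excess_path = true;
                    return false;         // stop the traversal early
                  }
                  return true;
                });
    // have_excess_path is now true, so a and c are not contracted into one node.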
@@ -153,6 +344,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { inlink_or_outlink_cleaner(o->inlinks); } } + FilterRedundantOutputOfSubGraph(graph_); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 8134494f8bccb132f2ed7d1ba1fb615a298596ed..531a170512f727d891aa6644ee08a60c25f16876 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { return false; }; -TEST_F(DFG_Tester, Split) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Split) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); @@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } -TEST_F(DFG_Tester, Fuse) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Fuse) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); size_t count0 = dfg.nodes.size(); @@ -82,7 +82,7 @@ TEST_F(DFG_Tester, Fuse) { // At least one nodes should be deleted. ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(6UL, count1); + ASSERT_EQ(11, count1); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc index f736e385c11add152dc9ab9485bf1de40f80b2f3..174c8513f92cf869419f04cab5a54af65e9673b8 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -68,9 +68,9 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass { } }; -Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { - DFG_GraphvizDrawPass::Config config( - FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); +AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { + DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, + "tensorrt_marked_node"); return new DfgDebuggerPass(config); } bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h index c558a6ebbde371071c7330a14cc986bf764d1773..c881a54c240538b68abdcb9060db69de3bf2b8bb 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h @@ -20,7 +20,7 @@ #pragma once #include -#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" namespace paddle { @@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { return "tensorrt sub-graph mark pass"; } - Pass* CreateGraphvizDebugerPass() const override; + AnalysisPass* CreateGraphvizDebugerPass() const override; bool Finalize() override; private: diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644 --- 
a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -22,11 +22,11 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { +TEST(TensorRTSubgraphNodeMarkPass, test) { // init FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); ASSERT_TRUE(pass.Initialize(&argument)); - argument.main_dfg.reset(new DataFlowGraph); pass.Run(argument.main_dfg.get()); TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { @@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { for (auto& node : argument.main_dfg->nodes.nodes()) { counter += node->attr(ATTR_supported_by_tensorrt).Bool(); } - + ASSERT_EQ(counter, 2); LOG(INFO) << counter << " nodes marked"; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index 9993de22800bc0aafdcbf46618e6b479ac1eb187..faf876de6d65d20cf7a084cd97392cfc8d791a42 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { SubGraphFuse(graph, node_inside_subgraph_teller_)(); + VLOG(4) << "debug info " + << graph->HumanReadableInfo(false /*show_values*/, + true /*show_functions*/); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index c6741a92095d33d261a4e1667c87a8ca02e51a9f..219e3f5470f627e81005aabf94f9c72c33fd2eed 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/node.h" -#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -25,7 +25,7 @@ namespace analysis { DEFINE_string(dot_dir, "./", ""); -TEST_F(DFG_Tester, tensorrt_single_pass) { +TEST(TensorRTSubGraphPass, main) { std::unordered_set teller_set( {"elementwise_add", "mul", "sigmoid"}); SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { @@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { return false; }; - LOG(INFO) << "init"; + Argument argument(FLAGS_inference_model_dir); + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; @@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - LOG(INFO) << "Initialize"; dfg_pass.Initialize(&argument); dfg_pass1.Initialize(&argument); pass0.Initialize(&argument); trt_pass.Initialize(&argument); - LOG(INFO) << "Run"; argument.main_dfg.reset(new DataFlowGraph); pass0.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get()); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -32,27 +32,12 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string& model_dir = FLAGS_inference_model_dir) { - std::string msg; - std::string net_file = FLAGS_inference_model_dir + "/__model__"; - std::ifstream fin(net_file, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); - fin.seekg(0, std::ios::end); - msg.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(msg.at(0)), msg.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(msg); - return program_desc; -} - static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); pass.Initialize(&argument); pass.Run(&graph); @@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5df486f345a98d7737d326c94e4854d24535ff61 --- /dev/null +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -0,0 +1,93 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +if(APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") +endif(APPLE) + + +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) + +if(WITH_GPU AND TENSORRT_FOUND) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) +endif() + +function(inference_api_test TARGET_NAME) + if (WITH_TESTING) + set(options "") + set(oneValueArgs SRC) + set(multiValueArgs ARGS) + cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) + cc_test(${TARGET_NAME} + SRCS ${inference_test_SRC} + DEPS "${inference_deps}" + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + if(inference_test_ARGS) + set_tests_properties(${TARGET_NAME} + PROPERTIES DEPENDS "${inference_test_ARGS}") + endif() + endif(WITH_TESTING) +endfunction(inference_api_test) + +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_test(test_paddle_inference_api + SRCS api_tester.cc + DEPS paddle_inference_api) + +inference_api_test(test_api_impl SRC api_impl_tester.cc + ARGS test_word2vec test_image_classification) + +if(WITH_GPU AND TENSORRT_FOUND) +cc_library(paddle_inference_tensorrt_subgraph_engine + SRCS api_tensorrt_subgraph_engine.cc + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) + +inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) +endif() + +if (WITH_ANAKIN AND WITH_MKL) # only needed in CI + # compile the libinference_anakin_api.a and anakin.so. + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + function(anakin_target target_name) + target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endfunction() + anakin_target(inference_anakin_api) + anakin_target(inference_anakin_api_shared) + if (WITH_TESTING) + # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later. 
+ set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com") + set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin") + set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt") + execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}") + execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N") + execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N") + if(WITH_GPU) + set(anakin_test_extra_deps dynload_cuda) + set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin") + execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N") + cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc + ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL) + endif() + cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc + ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin + --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt + DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL) + endif(WITH_TESTING) +endif() diff --git a/paddle/contrib/inference/README.md b/paddle/fluid/inference/api/README.md similarity index 100% rename from paddle/contrib/inference/README.md rename to paddle/fluid/inference/api/README.md diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..684e0ce0e292d852d4601ebd1ccd920382e42c8b --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(profile); + +namespace paddle { + +bool AnalysisPredictor::Init( + const std::shared_ptr& parent_scope) { + VLOG(3) << "Predictor::init()"; +#if !defined(_WIN32) + if (FLAGS_profile) { + LOG(WARNING) << "Profiler is actived, might affect the performance"; + LOG(INFO) << "You can turn off by set gflags '-profile false'"; + auto tracking_device = config_.use_gpu ? 
platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; + platform::EnableProfiler(tracking_device); + } +#endif + + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + LOG(WARNING) << "ir optimize only supports CPU currently"; + config_.enable_ir_optim = false; + } else { + place_ = paddle::platform::CPUPlace(); + } + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model."; + return false; + } + + OptimizeInferenceProgram(); + ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } + + VLOG(5) << "to create variables"; + PADDLE_ENFORCE(scope_.get()); + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +void AnalysisPredictor::OptimizeInferenceProgram() { + LOG(INFO) << "optimize begin"; + FLAGS_IA_enable_ir = config_.enable_ir_optim; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_output_storage_path = ""; // Don't output the model. + // Analyze inference_program + if (!config_.model_dir.empty()) { + argument_.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument_.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); + } + argument_.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude, + "Only kExclude is supported yet."); + Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); + + CHECK(argument_.transformed_program_desc); + VLOG(5) << "to prepare executor"; + inference_program_.reset( + new framework::ProgramDesc(*argument_.transformed_program_desc)); + if (argument_.Has(framework::ir::kParamScopeAttr)) { + // Update scope. + scope_.reset( + argument_.Release(framework::ir::kParamScopeAttr)); + } + LOG(INFO) << "== optimize end =="; +} + +template <> +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) { + VLOG(3) << "create AnalysisConfig"; + if (config.use_gpu) { + // 1. 
GPU memeroy + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor(new AnalysisPredictor(config)); + if (!dynamic_cast(predictor.get())->Init(nullptr)) { + return nullptr; + } + return predictor; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..e53925366e9214cd60422efe56884751297c15e5 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +using inference::analysis::Argument; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; + +/* This predictor is based on the original native predictor with IR and Analysis + * support. It will optimize IR and Parameters in the runtime. + * TODO(Superjomn) Replace the Navive predictor? + */ +class AnalysisPredictor : public NativePaddlePredictor { + public: + explicit AnalysisPredictor(const AnalysisConfig& config) + : NativePaddlePredictor(config), config_(config) {} + + bool Init(const std::shared_ptr& parent_scope); + + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram(); + + Argument& analysis_argument() { return argument_; } + + private: + AnalysisConfig config_; + Argument argument_; +}; + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/fluid/inference/api/api.cc similarity index 67% rename from paddle/contrib/inference/paddle_inference_api.cc rename to paddle/fluid/inference/api/api.cc index ea46b3006f8d0964cc8229d3683ee7b602d6ef0d..c71769a32f604358fe68c927546591310649f116 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -1,18 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -23,7 +21,6 @@ int PaddleDtypeSize(PaddleDType dtype) { case PaddleDType::INT64: return sizeof(int64_t); default: - // assert(false); return -1; } @@ -41,22 +38,41 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length()); + memcpy(data_, other.data(), other.length()); + length_ = other.length(); + memory_owned_ = true; + } + return *this; +} + +PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { // only the buffer with external memory can be copied - assert(!other.memory_owned_); data_ = other.data_; length_ = other.length_; memory_owned_ = other.memory_owned_; + other.data_ = nullptr; + other.length_ = 0; + other.memory_owned_ = false; return *this; } void PaddleBuf::Resize(size_t length) { // Only the owned memory can be reset, the external memory can't be changed. - if (length_ == length) return; - assert(memory_owned_); - Free(); - data_ = new char[length]; - length_ = length; - memory_owned_ = true; + if (length_ >= length) return; + if (memory_owned_) { + Free(); + data_ = malloc(length); + length_ = length; + memory_owned_ = true; + } else { + PADDLE_THROW("The memory is allocated externally, can not Resized"); + } } void PaddleBuf::Reset(void* data, size_t length) { @@ -68,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { - assert(length_ > 0); - delete static_cast(data_); + PADDLE_ENFORCE_GT(length_, 0); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..43b31269d2bd82c06e284e3599a3763da693a2af --- /dev/null +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
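The rewritten PaddleBuf above distinguishes memory it owns (allocated with malloc, released in Free(), growable through Resize) from externally supplied memory (wrapped by the pointer constructor or Reset and never reallocated). A minimal usage sketch against the paddle_inference_api.h interface shown in this patch, assuming a default-constructed buffer starts out as owned:

```cpp
// Illustrative only: exercises the PaddleBuf ownership rules introduced above.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void PaddleBufOwnershipSketch() {
  // Owned buffer: storage is allocated by PaddleBuf and grows through Resize().
  paddle::PaddleBuf owned;
  owned.Resize(32 * sizeof(float));  // allocates 128 bytes
  owned.Resize(16 * sizeof(float));  // no-op: Resize never shrinks an existing buffer

  // External buffer: PaddleBuf only wraps caller-provided memory.
  std::vector<float> backing(1 * 3 * 224 * 224, 1.0f);
  paddle::PaddleBuf external(backing.data(), backing.size() * sizeof(float));
  // Growing it is rejected, since non-owned memory cannot be reallocated:
  // external.Resize(2 * backing.size() * sizeof(float));  // would PADDLE_THROW

  // Copy-assignment deep-copies an owned buffer but merely shares the pointer
  // of an externally backed one.
  paddle::PaddleBuf copy_of_owned = owned;
  paddle::PaddleBuf alias_of_external = external;
  (void)copy_of_owned;
  (void)alias_of_external;
}
```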
+ +#include "paddle/fluid/inference/api/api_anakin_engine.h" + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "framework/core/net/net.h" +#include "framework/operators/ops.h" +#include "saber/funcs/timer.h" + +namespace paddle { + +template +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( + const AnakinConfig &config) { + CHECK(Init(config)); +} +template <> +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( + const AnakinConfig &config) { + omp_set_dynamic(0); + omp_set_num_threads(1); + mkl_set_num_threads(1); + CHECK(Init(config)); +} +template +bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { + if (!(graph_.load(config.model_file))) { + VLOG(3) << "fail to load graph from " << config.model_file; + return false; + } + auto inputs = graph_.get_ins(); + for (auto &input_str : inputs) { + graph_.ResetBatchSize(input_str, config.max_batch_size); + max_batch_size_ = config.max_batch_size; + } + // optimization for graph + if (!(graph_.Optimize())) { + return false; + } + // construct executer + if (executor_p_ == nullptr) { + executor_p_ = new anakin::Net(graph_, true); + } + return true; +} + +template +bool PaddleInferenceAnakinPredictor::Run( + const std::vector &inputs, + std::vector *output_data, int batch_size) { + for (const auto &input : inputs) { + if (input.dtype != PaddleDType::FLOAT32) { + VLOG(3) << "Only support float type inputs. " << input.name + << "'s type is not float"; + return false; + } + auto d_tensor_in_p = executor_p_->get_in(input.name); + auto net_shape = d_tensor_in_p->shape(); + if (net_shape.size() != input.shape.size()) { + VLOG(3) << " input " << input.name + << "'s shape size should be equal to that of net"; + return false; + } + int sum = 1; + for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); + if (sum > net_shape.count()) { + graph_.Reshape(input.name, input.shape); + delete executor_p_; + executor_p_ = new anakin::Net(graph_, true); + d_tensor_in_p = executor_p_->get_in(input.name); + } + + anakin::saber::Shape tmp_shape; + for (auto s : input.shape) { + tmp_shape.push_back(s); + } + d_tensor_in_p->reshape(tmp_shape); + + if (input.lod.size() > 0) { + if (input.lod.size() > 1) { + VLOG(3) << " input lod first dim should <=1, but you set " + << input.lod.size(); + return false; + } + std::vector offset(input.lod[0].begin(), input.lod[0].end()); + d_tensor_in_p->set_seq_offset(offset); + VLOG(3) << "offset.size(): " << offset.size(); + for (int i = 0; i < offset.size(); i++) { + VLOG(3) << offset[i]; + } + } + + float *d_data_p = d_tensor_in_p->mutable_data(); + +#ifdef PADDLE_WITH_CUDA + if (std::is_same::value) { + if (cudaMemcpy(d_data_p, static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + VLOG(3) << "copy data from CPU to GPU error"; + return false; + } + } +#endif + if (std::is_same::value) { + memcpy(d_data_p, static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float)); + } + } +#ifdef PADDLE_WITH_CUDA + cudaDeviceSynchronize(); + executor_p_->prediction(); + cudaDeviceSynchronize(); +#endif + + if (output_data->empty()) { + VLOG(3) << "At least one output should be set with tensors' names."; + return false; + } + for (auto &output : *output_data) { + auto *tensor = executor_p_->get_out(output.name); + output.shape = tensor->valid_shape(); + if (output.data.length() < tensor->valid_size() * sizeof(float)) { + 
output.data.Resize(tensor->valid_size() * sizeof(float)); + } + +#if PADDLE_WITH_CUDA + if (std::is_same::value) { + // Copy data from GPU -> CPU + if (cudaMemcpy(output.data.data(), tensor->mutable_data(), + tensor->valid_size() * sizeof(float), + cudaMemcpyDeviceToHost) != 0) { + VLOG(3) << "copy data from GPU to CPU error"; + return false; + } + } +#endif + if (std::is_same::value) { + memcpy(output.data.data(), tensor->mutable_data(), + tensor->valid_size() * sizeof(float)); + } + } + return true; +} + +template +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return *executor_p_; +} + +// the cloned new Predictor of anakin share the same net weights from original +// Predictor +template +std::unique_ptr +PaddleInferenceAnakinPredictor::Clone() { + VLOG(3) << "Anakin Predictor::clone"; + std::unique_ptr cls( + new PaddleInferenceAnakinPredictor()); + // construct executer from other graph + auto anakin_predictor_p = + dynamic_cast *>(cls.get()); + if (!anakin_predictor_p) { + VLOG(3) << "fail to call Init"; + return nullptr; + } + anakin_predictor_p->get_executer().init(graph_); + + return std::move(cls); +} + +#ifdef PADDLE_WITH_CUDA +template class PaddleInferenceAnakinPredictor; +#endif +template class PaddleInferenceAnakinPredictor; + +// A factory to help create difference predictor. +template <> +std::unique_ptr CreatePaddlePredictor< + AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { + VLOG(3) << "Anakin Predictor create."; + if (config.target_type == AnakinConfig::NVGPU) { +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; +#else + LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment"; + return nullptr; +#endif + } else if (config.target_type == AnakinConfig::X86) { + VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else { + VLOG(3) << "Anakin Predictor create on unknown platform."; + return nullptr; + } +} + +#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER +template +using executor_t = + anakin::Net; + +template +void DisplayOpTimer(executor_t *net_executor, int epoch) { + std::vector op_time = net_executor->get_op_time(); + auto exec_funcs = net_executor->get_exec_funcs(); + auto op_param = net_executor->get_op_param(); + for (int i = 0; i < op_time.size(); i++) { + LOG(INFO) << "name: " << exec_funcs[i].name + << " op_type: " << exec_funcs[i].op_name + << " op_param: " << op_param[i] << " time " << op_time[i] / epoch; + } + std::map op_map; + for (int i = 0; i < op_time.size(); i++) { + auto it = op_map.find(op_param[i]); + if (it != op_map.end()) + op_map[op_param[i]] += op_time[i]; + else + op_map.insert(std::pair(op_param[i], op_time[i])); + } + for (auto it = op_map.begin(); it != op_map.end(); ++it) { + LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; + } +} +#endif + +template +PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { +#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER + DisplayOpTimer(executor_p_, max_batch_size_); +#endif + delete executor_p_; + executor_p_ = nullptr; +} + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h similarity index 66% rename from paddle/contrib/inference/paddle_inference_api_anakin_engine.h rename to paddle/fluid/inference/api/api_anakin_engine.h index 
212ba41cdf8ff2feccb6b6498f9679d76a2efe7c..dd08661880d8cc3a9f4401e9af91a3d10e6579b6 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,42 +19,45 @@ limitations under the License. */ #pragma once -#include "paddle/contrib/inference/paddle_inference_api.h" +#include -// from anakin #include "framework/core/net/net.h" +#include "framework/graph/graph.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "saber/core/shape.h" #include "saber/saber_types.h" namespace paddle { +template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: PaddleInferenceAnakinPredictor() {} - PaddleInferenceAnakinPredictor(const AnakinConfig& config); + explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config); // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. bool Run(const std::vector& inputs, - std::vector* output_data) override; + std::vector* output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; - anakin::Net& + anakin::Net& get_executer(); - ~PaddleInferenceAnakinPredictor() override{}; + ~PaddleInferenceAnakinPredictor() override; private: bool Init(const AnakinConfig& config); - anakin::graph::Graph + anakin::graph::Graph graph_; - anakin::Net - executor_; + anakin::Net* + executor_p_{nullptr}; AnakinConfig config_; + int max_batch_size_{0}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..98c74aaa562dce6618ccde8f11f4344eefd59ef2 --- /dev/null +++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc @@ -0,0 +1,247 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/timer.h" +#include "utils/logger/logger.h" + +DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(datapath, "", "Path of the dataset."); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); + +class Data { + public: + Data(std::string file_name, int batch_size) + : _batch_size(batch_size), _total_length(0) { + _file.open(file_name); + _file.seekg(_file.end); + _total_length = _file.tellg(); + _file.seekg(_file.beg); + } + void get_batch_data(std::vector>& fea, // NOLINT + std::vector>& week_fea, // NOLINT + std::vector>& time_fea, // NOLINT + std::vector& seq_offset); // NOLINT + + private: + std::fstream _file; + int _total_length; + int _batch_size; +}; + +void Data::get_batch_data( + std::vector>& fea, // NOLINT + std::vector>& week_fea, // NOLINT + std::vector>& time_fea, // NOLINT + std::vector& seq_offset) { // NOLINT + int seq_num = 0; + long unsigned int cum = 0; // NOLINT + + char buf[10000]; + seq_offset.clear(); + seq_offset.push_back(0); + fea.clear(); + week_fea.clear(); + time_fea.clear(); + while (_file.getline(buf, 10000)) { + std::vector data_vec; + paddle::inference::split(buf, ':', &data_vec); + + std::vector seq; + paddle::inference::split(data_vec[0], '|', &seq); + + for (auto link : seq) { + std::vector vec; + paddle::inference::split_to_float(link, ',', &vec); + fea.push_back(vec); + } + + std::vector vec_w; + paddle::inference::split_to_float(data_vec[2], ',', &vec_w); + week_fea.push_back(vec_w); + + std::vector vec_t; + paddle::inference::split_to_float(data_vec[1], ',', &vec_t); + time_fea.push_back(vec_t); + + cum += seq.size(); + seq_offset.push_back(cum); + + seq_num++; + if (seq_num >= _batch_size) { + break; + } + } +} + +namespace paddle { + +AnakinConfig GetConfig() { + AnakinConfig config; + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = AnakinConfig::X86; + config.model_file = FLAGS_model; + config.device = 0; + config.max_batch_size = 1000; // the max number of token + return config; +} + +void set_tensor(std::string name, std::vector shape, + std::vector& vec) { // NOLINT + int sum = 1; + std::for_each(shape.begin(), shape.end(), [&](int n) { sum *= n; }); + float* data = new float[sum]; + PaddleTensor tensor; + tensor.name = name; + tensor.shape = shape; + tensor.data = PaddleBuf(data, sum); + tensor.dtype = PaddleDType::FLOAT32; + vec.push_back(tensor); +} + +void single_test() { + AnakinConfig config = GetConfig(); + auto predictor = + CreatePaddlePredictor(config); + + int max_batch_size = 1000; + std::string feature_file = FLAGS_datapath; + Data map_data(feature_file, FLAGS_batch_size); + std::vector> fea; + std::vector> week_fea; + std::vector> time_fea; + std::vector seq_offset; // NOLINT + + paddle::PaddleTensor tensor_0, tensor_1, tensor_2; + tensor_0.name = "input_0"; + tensor_1.name = "input_4"; + tensor_2.name = "input_5"; + + PaddleTensor tensor_out; + tensor_out.name = "final_output.tmp_1_gout"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; + + std::vector inputs; + std::vector outputs(1, tensor_out); + + int data_0_dim = 38; + int data_1_dim = 10; + int data_2_dim = 10; + float data_0[max_batch_size * 
data_0_dim]; // NOLINT + float data_1[max_batch_size * data_1_dim]; // NOLINT + float data_2[max_batch_size * data_2_dim]; // NOLINT + + int count = 0; + while (true) { + if (count++ > 0) break; // only run the first batch in ci. + seq_offset.clear(); + map_data.get_batch_data(fea, week_fea, time_fea, seq_offset); + if (seq_offset.size() <= 1) { + LOG(FATAL) << "seq_offset.size() <= 1, exit."; + break; + } + + std::vector> seq_offset_vec; // NOLINT + seq_offset_vec.push_back(seq_offset); + tensor_0.lod = seq_offset_vec; + + int p_shape_0[] = {(int)fea.size(), 1, 1, data_0_dim}; // NOLINT + int p_shape_1[] = {(int)week_fea.size(), data_1_dim, 1, 1}; // NOLINT + int p_shape_2[] = {(int)time_fea.size(), data_2_dim, 1, 1}; // NOLINT + + std::vector shape_0(p_shape_0, p_shape_0 + 4); + std::vector shape_1(p_shape_1, p_shape_1 + 4); + std::vector shape_2(p_shape_2, p_shape_2 + 4); + + tensor_0.shape = shape_0; + tensor_1.shape = shape_1; + tensor_2.shape = shape_2; + + for (int i = 0; i < fea.size(); i++) { + memcpy(data_0 + i * data_0_dim, &fea[i][0], sizeof(float) * data_0_dim); + } + for (int i = 0; i < week_fea.size(); i++) { + memcpy(data_1 + i * data_1_dim, &week_fea[i][0], + sizeof(float) * data_1_dim); + } + for (int i = 0; i < time_fea.size(); i++) { + memcpy(data_2 + i * data_2_dim, &time_fea[i][0], + sizeof(float) * data_2_dim); + } + + tensor_0.data = + paddle::PaddleBuf(data_0, fea.size() * sizeof(float) * data_0_dim); + tensor_1.data = + paddle::PaddleBuf(data_1, week_fea.size() * sizeof(float) * data_1_dim); + tensor_2.data = + paddle::PaddleBuf(data_2, time_fea.size() * sizeof(float) * data_2_dim); + + tensor_0.dtype = paddle::PaddleDType::FLOAT32; + tensor_1.dtype = paddle::PaddleDType::FLOAT32; + tensor_2.dtype = paddle::PaddleDType::FLOAT32; + + inputs.clear(); + inputs.push_back(tensor_1); + inputs.push_back(tensor_2); + inputs.push_back(tensor_0); + + paddle::inference::Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs); + + paddle::inference::PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, + timer.toc() / FLAGS_repeat); + LOG(INFO) << "sequence_length = " << seq_offset[seq_offset.size() - 1]; + + float* data_o = static_cast(outputs[0].data.data()); + VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length(); + for (size_t j = 0; j < outputs[0].data.length(); ++j) { + VLOG(3) << "output[" << j << "]: " << data_o[j]; + } + } +} +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + logger::init(argv[0]); + + paddle::single_test(); + /* multi-threads + std::vector threads; + int num = 1; + for (int i = 0; i < num; i++) { + LOG(INFO) << " thread id : " << i; + threads.emplace_back(paddle::single_test); + } + for (int i = 0; i < num; i++) { + threads[i].join(); + } + threads.clear(); + */ + + return 0; +} diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc similarity index 61% rename from paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc rename to paddle/fluid/inference/api/api_anakin_engine_tester.cc index f92e9d4190412f5847e353ef1dc0324cad668c9a..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "gflags/gflags.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" -DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(model, "", "Directory of the inference model(mobile_v2)."); namespace paddle { AnakinConfig GetConfig() { AnakinConfig config; + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = AnakinConfig::NVGPU; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1; @@ -36,28 +38,27 @@ TEST(inference, anakin) { CreatePaddlePredictor(config); float data[1 * 3 * 224 * 224] = {1.0f}; - - PaddleTensor tensor{.name = "input_0", - .shape = std::vector({1, 3, 224, 224}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; // For simplicity, we set all the slots with the same data. - std::vector paddle_tensor_feeds; - paddle_tensor_feeds.emplace_back(std::move(tensor)); + std::vector paddle_tensor_feeds(1, tensor); - PaddleTensor tensor_out{.name = "prob_out", - .shape = std::vector({1000, 1}), - .data = PaddleBuf(), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor_out; + tensor_out.name = "prob_out"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; - std::vector outputs; - outputs.emplace_back(std::move(tensor_out)); + std::vector outputs(1, tensor_out); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); float* data_o = static_cast(outputs[0].data.data()); - for (size_t j = 0; j < 1000; ++j) { + for (size_t j = 0; j < outputs[0].data.length(); ++j) { LOG(INFO) << "output[" << j << "]: " << data_o[j]; } } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e9e10139fa7008a46c3782960dfd44d3228cc26 --- /dev/null +++ b/paddle/fluid/inference/api/api_impl.cc @@ -0,0 +1,333 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
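The Anakin tester above replaces designated initializers (a C99 extension that not all targeted C++ compilers accept) with plain member assignment when filling PaddleTensor. A small sketch of building one FLOAT32 feed tensor in that style; the tensor name and shape are examples only:

```cpp
// Sketch: building a single FLOAT32 feed tensor the way the updated testers do.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// `data` must stay alive while the returned tensor is in use, because
// PaddleBuf(data, size) wraps the memory without taking ownership.
paddle::PaddleTensor MakeImageTensor(float* data, size_t numel) {
  paddle::PaddleTensor tensor;
  tensor.name = "input_0";                            // feed variable name (example)
  tensor.shape = std::vector<int>({1, 3, 224, 224});  // example NCHW shape
  tensor.data = paddle::PaddleBuf(data, numel * sizeof(float));
  tensor.dtype = paddle::PaddleDType::FLOAT32;
  return tensor;
}
```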
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/timer.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(profile, false, "Turn on profiler for fluid"); + +namespace paddle { +namespace { +using paddle::inference::Timer; + +template +std::string num2str(T a) { + std::stringstream istr; + istr << a; + return istr.str(); +} +} // namespace + +void NativePaddlePredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +bool NativePaddlePredictor::Init( + std::shared_ptr parent_scope) { + VLOG(3) << "Predictor::init()"; +#if !defined(_WIN32) + if (FLAGS_profile) { + LOG(WARNING) << "Profiler is actived, might affect the performance"; + LOG(INFO) << "You can turn off by set gflags '-profile false'"; + + auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; + platform::EnableProfiler(tracking_device); + } +#endif + + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail"); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model."; + return false; + } + + ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +NativePaddlePredictor::~NativePaddlePredictor() { +#if !defined(_WIN32) + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } +#endif + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } +} + +bool NativePaddlePredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + VLOG(3) << "Predictor::predict"; + Timer timer; + timer.tic(); + // set feed variable + std::vector feeds; + framework::Scope *scope = sub_scope_ != nullptr ? 
sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; + return false; + } + // Run the inference program + // if share variables, we need not create variables + VLOG(4) << "Run prepared context"; + executor_->RunPreparedContext(ctx_.get(), scope, + false, /* don't create local scope each time*/ + false /* don't create variable eatch time */); + VLOG(4) << "Finish prepared context"; + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; + } + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + return true; +} + +std::unique_ptr NativePaddlePredictor::Clone() { + VLOG(3) << "Predictor::clone"; + std::unique_ptr cls(new NativePaddlePredictor(config_)); + + if (!dynamic_cast(cls.get())->Init(scope_)) { + LOG(ERROR) << "fail to call Init"; + return nullptr; + } +#ifdef __clang__ + // fix clang compile error + return cls; +#else + // fix manylinux compile error. + return std::move(cls); +#endif +} + +bool NativePaddlePredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + for (size_t i = 0; i < inputs.size(); ++i) { + framework::LoDTensor input; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} +template +void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + std::vector shape; + auto dims_i = fetch.dims(); + auto lod = fetch.lod(); + const T *output_ptr = fetch.data(); + auto num = fetch.numel(); + std::vector data; + if (0 == lod.size()) { + std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); + for (int j = 0; j < dims_i.size(); ++j) { + shape.push_back(dims_i[j]); + } + } else { + // for batch detection + // image[0] -> output[0] shape {145, 6} + // image[1] -> output[1] shape {176, 6} + // then, + // the batch output shape {321, 6} + // the lod {{0, 145, 321}} + // so we should append output[0] to {176, 6} + size_t max_dim = 0; + for (size_t j = 1; j < lod[0].size(); j++) { + max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); + } + size_t common_dim = lod[0].back() == 0 ? 
0 : num / lod[0].back(); + if (max_dim > 0) { + data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); + } + for (size_t j = 1; j < lod[0].size(); j++) { + size_t start = lod[0][j - 1] * common_dim; + size_t end = lod[0][j] * common_dim; + if (end > start) { + std::copy(output_ptr + start, output_ptr + end, + data.begin() + (j - 1) * max_dim * common_dim); + } + } + shape.push_back(lod[0].size() - 1); + shape.push_back(max_dim); + for (int j = 1; j < dims_i.size(); ++j) { + shape.push_back(dims_i[j]); + } + } + + output->shape = shape; + auto &buffer = output->data; + if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { + buffer.Resize(sizeof(T) * data.size()); + } + std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); + // copy LoD + for (const auto &level : fetch.lod()) { + output->lod.emplace_back(level); + } +} + +bool NativePaddlePredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + } + return true; +} + +template <> +std::unique_ptr CreatePaddlePredictor< + NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { + VLOG(3) << "create NativePaddlePredictor"; + if (config.use_gpu) { + // 1. 
GPU memeroy + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + num2str(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor(new NativePaddlePredictor(config)); + if (!dynamic_cast(predictor.get())->Init(nullptr)) { + return nullptr; + } +#ifdef __clang__ + // fix clang compile error + return predictor; +#else + return std::move(predictor); +#endif +} + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/fluid/inference/api/api_impl.h similarity index 75% rename from paddle/contrib/inference/paddle_inference_api_impl.h rename to paddle/fluid/inference/api/api_impl.h index f9ec6f55449fc46b4a44b9563980cb5f8e80a951..ec801c58857e716241d28404510530e551ed25aa 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -15,11 +15,12 @@ #pragma once #include +#include #include #include #include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -38,7 +39,8 @@ class NativePaddlePredictor : public PaddlePredictor { bool Init(std::shared_ptr parent_scope); bool Run(const std::vector &inputs, - std::vector *output_data) override; + std::vector *output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; @@ -46,9 +48,13 @@ class NativePaddlePredictor : public PaddlePredictor { protected: bool SetFeed(const std::vector &input_datas, - std::vector *feeds); - bool GetFetch(const std::vector &fetchs, - std::vector *output_data); + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); + void PrepareFeedFetch(); NativeConfig config_; platform::Place place_; @@ -56,8 +62,9 @@ class NativePaddlePredictor : public PaddlePredictor { std::shared_ptr scope_; std::unique_ptr ctx_; std::unique_ptr inference_program_; - std::vector feed_target_names_; - std::vector fetch_target_names_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; // Do not use unique_ptr, use parent scope to delete framework::Scope *sub_scope_{nullptr}; }; diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/fluid/inference/api/api_impl_tester.cc similarity index 97% rename from paddle/contrib/inference/test_paddle_inference_api_impl.cc rename to paddle/fluid/inference/api/api_impl_tester.cc index 88c4e665a3daed0ed34b23b75d360acbd586401f..fc1364b80ac1ee2d304eb2fe429eae5f56967516 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -15,10 +15,10 @@ limitations under the License. 
*/ #include #include -#include +#include // NOLINT #include "gflags/gflags.h" -#include "paddle/contrib/inference/paddle_inference_api_impl.h" +#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); @@ -121,8 +121,8 @@ void MainImageClassification(bool use_gpu) { // which should be in the range [0.0, 1.0]. feed_target_shapes[0][0] = batch_size; framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); - SetupTensor( - &input, input_dims, static_cast(0), static_cast(1)); + SetupTensor(&input, input_dims, static_cast(0), + static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); @@ -249,7 +249,7 @@ void MainThreadsImageClassification(bool use_gpu) { const size_t len = local_outputs[0].data.length(); float* data = static_cast(local_outputs[0].data.data()); float* ref_data = refs[tid].data(); - EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); for (int i = 0; i < refs[tid].numel(); ++i) { EXPECT_NEAR(ref_data[i], data[i], 1e-3); } diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc similarity index 68% rename from paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc rename to paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index a11396cee91a758e86af2efd9e58b9da68442590..abee375313850f1490bacec11f737706c061a5e9 100644 --- a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/contrib/inference/paddle_inference_api.h" -#include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { @@ -30,8 +32,10 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope) { + FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; - + FLAGS_tensorrt_max_batch_size = config_.max_batch_size; + FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -64,8 +68,41 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return false; } + OptimizeInferenceProgram(); + ctx_ = executor_->Prepare(*inference_program_, 0); + + VLOG(5) << "to create variables"; + executor_->CreateVariables(*inference_program_, + sub_scope_ ? 
sub_scope_ : scope_.get(), 0); + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; + } + + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + PADDLE_ENFORCE_GT(batch_size, 0, + "TensorRT engine needs the argument batch_size set"); + FLAGS_tensorrt_engine_batch_size = batch_size; + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } argument.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument); @@ -73,17 +110,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { VLOG(5) << "transformed program:\n" << argument.transformed_program_desc->SerializeAsString(); VLOG(5) << "to prepare executor"; - *inference_program_->Proto() = *argument.transformed_program_desc; - ctx_ = executor_->Prepare(*inference_program_, 0); - - VLOG(5) << "to create variables"; - executor_->CreateVariables( - *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); - - // Get the feed_target_names and fetch_target_names - feed_target_names_ = inference_program_->GetFeedTargetNames(); - fetch_target_names_ = inference_program_->GetFetchTargetNames(); - return true; + inference_program_.reset( + new framework::ProgramDesc(*argument.transformed_program_desc)); } private: @@ -98,8 +126,7 @@ CreatePaddlePredictor( if (config.use_gpu) { // 1. GPU memeroy PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, - 0.f, + config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); std::vector flags; @@ -124,3 +151,13 @@ CreatePaddlePredictor( } } // namespace paddle + +USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(mul); +USE_TRT_CONVERTER(conv2d); +USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(fc); +USE_TRT_CONVERTER(pool2d); +USE_TRT_CONVERTER(softmax); +USE_TRT_CONVERTER(batch_norm); +USE_TRT_CONVERTER(concat); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e7425eddd2df07ffe897f908aad360abe42117a --- /dev/null +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
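With the changes above, the TensorRT subgraph predictor must be sized up front through TensorRTConfig (max_batch_size, workspace_size) and requires a positive batch_size argument in every Run() call. A hedged sketch of that calling contract follows, mirroring the tester below; the engine-kind name kAutoMixedTensorRT is an assumption, since only kNative, kAnakin and kAnalysis appear verbatim in this patch:

```cpp
// Sketch only: exercises the new Run(inputs, outputs, batch_size) contract.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunWithTensorRTSubgraph(const std::vector<paddle::PaddleTensor>& feeds) {
  paddle::TensorRTConfig config;
  config.model_dir = "./word2vec.inference.model";  // illustrative model path
  config.use_gpu = true;
  config.fraction_of_gpu_memory = 0.3;
  config.device = 0;
  config.max_batch_size = 10;  // upper bound baked into the TensorRT engine

  // Engine-kind name assumed (see note above).
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::TensorRTConfig,
                                    paddle::PaddleEngineKind::kAutoMixedTensorRT>(
          config);

  std::vector<paddle::PaddleTensor> outputs;
  // batch_size must be > 0 and should not exceed config.max_batch_size.
  predictor->Run(feeds, &outputs, /*batch_size=*/10);
}
```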
+ +#include +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +DEFINE_string(dirname, "", "Directory of the inference model."); + +void CompareTensorRTWithFluid(bool enable_tensorrt) { + FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt; + + //# 1. Create PaddlePredictor with a config. + NativeConfig config0; + config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.use_gpu = true; + config0.fraction_of_gpu_memory = 0.3; + config0.device = 0; + + TensorRTConfig config1; + config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.use_gpu = true; + config1.fraction_of_gpu_memory = 0.3; + config1.device = 0; + config1.max_batch_size = 10; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = + CreatePaddlePredictor(config1); + + for (int batch_id = 0; batch_id < 1; batch_id++) { + //# 2. Prepare input. + std::vector data(20); + for (int i = 0; i < 20; i++) data[i] = i; + + PaddleTensor tensor; + tensor.shape = std::vector({10, 1}); + tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); + tensor.dtype = PaddleDType::INT64; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + + //# 3. Run + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(slots, &outputs0)); + CHECK(predictor1->Run(slots, &outputs1, 10)); + + //# 4. Get output. + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); + } + } +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) { + CompareTensorRTWithFluid(false); +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) { + CompareTensorRTWithFluid(true); +} + +} // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api.cc b/paddle/fluid/inference/api/api_tester.cc similarity index 91% rename from paddle/contrib/inference/test_paddle_inference_api.cc rename to paddle/fluid/inference/api/api_tester.cc index bc7faab6e208a66d7a56e41a56bd743c7644eea2..7a579610eefda24c911edd28b5f3a178aa10ab1e 100644 --- a/paddle/contrib/inference/test_paddle_inference_api.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/contrib/inference/paddle_inference_api.h" - #include #include +#include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { @@ -36,7 +35,8 @@ class DemoPredictor : public PaddlePredictor { LOG(INFO) << "I get other_config " << config.other_config; } bool Run(const std::vector &inputs, - std::vector *output_data) override { + std::vector *output_data, + int batch_size = 0) override { LOG(INFO) << "Run"; return false; } diff --git a/paddle/fluid/inference/api/demo_ci/.gitignore b/paddle/fluid/inference/api/demo_ci/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1269488f7fb1f4b56a8c0e5eb48cecbfadfa9219 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/.gitignore @@ -0,0 +1 @@ +data diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..afb46a7139f6ab8e6b3697fdc56fe1c78a05cd64 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -0,0 +1,92 @@ +cmake_minimum_required(VERSION 3.0) + +project(cpp_inference_demo CXX C) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if (WIN32) +set(CMAKE_STATIC_LIBRARY_PREFIX "lib") +else() +set(CMAKE_STATIC_LIBRARY_PREFIX "") +endif() + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." 
ON) + +if(WITH_GPU) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") +endif() + +include_directories("${PADDLE_LIB}") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +if (NOT WIN32) +include_directories("${PADDLE_LIB}/third_party/install/snappy/include") +include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +endif(NOT WIN32) + +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +if (NOT WIN32) +link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") +link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") +endif(NOT WIN32) + +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/paddle/fluid/inference") + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) + +if(WITH_MKL) + include_directories("${PADDLE_LIB}/third_party/install/mklml/include") + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif() +else() + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS + ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS + ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +if (NOT WIN32) +set(EXTERNAL_LIB "-lrt -ldl -lpthread") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf snappystream snappy z + ${EXTERNAL_LIB}) +else() +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf + ${EXTERNAL_LIB}) +endif(NOT WIN32) + +if(WITH_GPU) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/inference/api/demo_ci/README.md b/paddle/fluid/inference/api/demo_ci/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f013da7f30acd84ec484773f4ea716a08efa0ff --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/README.md @@ -0,0 +1,26 @@ +# Inference Demos + +There are several demos: + +- simple_on_word2vec: + - Follow the C++ codes is in `simple_on_word2vec.cc`. + - It is suitable for word2vec model. +- vis_demo: + - Follow the C++ codes is in `vis_demo.cc`. + - It is suitable for mobilenet, se_resnext50 and ocr three models. 
+ - Input data format: + - Each line contains a single record + - Each record's format is + ``` + \t + ``` + +To build and execute the demos, simply run +``` +./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU +``` +- It will build and execute the demos in both static and shared library. +- `$PADDLE_ROOT`: paddle library path +- `$TURN_ON_MKL`: use MKL or Openblas +- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode +- NOTE: for simple_on_word2vec, must run `ctest -R test_word2vec -R` to obtain word2vec model at first. diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh new file mode 100755 index 0000000000000000000000000000000000000000..0d9f3d2aa237acaf3bd7adb031b1f2a73c555352 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -0,0 +1,4 @@ +set -x +cd `dirname $0` +rm -rf build/ data/ +set +x diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..0f7d541c5edfc62e80cf50f83b491f06dcb42644 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -0,0 +1,87 @@ +set -x +PADDLE_ROOT=$1 +TURN_ON_MKL=$2 # use MKL or Openblas +TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +if [ $2 == ON ]; then + # You can export yourself if move the install path + MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} +fi +if [ $3 == ON ]; then + use_gpu_list='true false' +else + use_gpu_list='false' +fi + +PREFIX=inference-vis-demos%2F +URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX} + +# download vis_demo data +function download() { + dir_name=$1 + mkdir -p $dir_name + cd $dir_name + if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then + echo "${PREFIX}{dir_name}.tar.gz has been downloaded." + else + wget -q ${URL_ROOT}$dir_name.tar.gz + tar xzf *.tar.gz + fi + cd .. +} +mkdir -p data +cd data +vis_demo_list='se_resnext50 ocr mobilenet' +for vis_demo_name in $vis_demo_list; do + download $vis_demo_name +done +cd .. + +# compile and test the demo +mkdir -p build +cd build + +for WITH_STATIC_LIB in ON OFF; do + # -----simple_on_word2vec----- + rm -rf * + cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=simple_on_word2vec \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB + make -j + word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' + if [ -d $word2vec_model ]; then + for use_gpu in $use_gpu_list; do + ./simple_on_word2vec \ + --dirname=$word2vec_model \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "simple_on_word2vec demo runs fail." + exit 1 + fi + done + fi + # ---------vis_demo--------- + rm -rf * + cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=vis_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB + make -j + for use_gpu in $use_gpu_list; do + for vis_demo_name in $vis_demo_list; do + ./vis_demo \ + --modeldir=../data/$vis_demo_name/model \ + --data=../data/$vis_demo_name/data.txt \ + --refer=../data/$vis_demo_name/result.txt \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "vis demo $vis_demo_name runs fail." 
+ exit 1 + fi + done + done +done +set +x diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc similarity index 60% rename from paddle/contrib/inference/demo/simple_on_word2vec.cc rename to paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index c253014642f39a042430992548a285cc7078a959..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -16,21 +16,27 @@ limitations under the License. */ * This file contains a simple demo for how to take a model for inference. */ +#include #include -#include #include -#include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include //NOLINT +#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_bool(use_gpu, false, "Whether use gpu."); namespace paddle { namespace demo { -DEFINE_string(dirname, "", "Directory of the inference model."); - void Main(bool use_gpu) { //# 1. Create PaddlePredictor with a config. NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + if (FLAGS_dirname.empty()) { + LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model"; + exit(1); + } + config.model_dir = FLAGS_dirname; config.use_gpu = use_gpu; config.fraction_of_gpu_memory = 0.15; config.device = 0; @@ -41,10 +47,10 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); @@ -54,12 +60,16 @@ void Main(bool use_gpu) { CHECK(predictor->Run(slots, &outputs)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); } } } @@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) { // Multi-threads only support on CPU // 0. Create PaddlePredictor with a config. NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname; config.use_gpu = use_gpu; config.fraction_of_gpu_memory = 0.15; config.device = 0; @@ -84,24 +94,28 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. 
Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + std::vector inputs(4, tensor); std::vector outputs; // 3. Run CHECK(predictor->Run(inputs, &outputs)); // 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "TID: " << tid << ", " - << "output buffer size: " << outputs.front().data.length(); + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); } } }); @@ -111,15 +125,18 @@ void MainThreads(int num_threads, bool use_gpu) { } } -TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } - -#ifdef PADDLE_WITH_CUDA -TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } -#endif - } // namespace demo } // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(false /* use_gpu*/); + paddle::demo::MainThreads(1, false /* use_gpu*/); + paddle::demo::MainThreads(4, false /* use_gpu*/); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); + paddle::demo::MainThreads(1, true /*use_gpu*/); + paddle::demo::MainThreads(4, true /*use_gpu*/); + } + return 0; +} diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h similarity index 93% rename from paddle/contrib/inference/demo/utils.h rename to paddle/fluid/inference/api/demo_ci/utils.h index b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1..cb8990671162dff47228736e69617229528cc093 100644 --- a/paddle/contrib/inference/demo/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -13,16 +13,15 @@ // limitations under the License. #pragma once +#include #include #include - -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/paddle_inference_api.h" namespace paddle { namespace demo { -static void split(const std::string& str, - char sep, +static void split(const std::string& str, char sep, std::vector* pieces) { pieces->clear(); if (str.empty()) { diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc similarity index 70% rename from paddle/contrib/inference/demo/vis_demo.cc rename to paddle/fluid/inference/api/demo_ci/vis_demo.cc index 45575f9a862de430236ae20cf498e542a45b1f4b..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644 --- a/paddle/contrib/inference/demo/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -18,26 +18,24 @@ limitations under the License. 
*/ #include #include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. -#include #include #include -#include "paddle/contrib/inference/demo/utils.h" -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/demo_ci/utils.h" +#include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); #endif - -namespace paddle { -namespace demo { - DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string( - data, - "", + data, "", "path of data; each line is a record, format is " "'\t data; @@ -47,7 +45,7 @@ struct Record { void split(const std::string& str, char sep, std::vector* pieces); Record ProcessALine(const std::string& line) { - LOG(INFO) << "process a line"; + VLOG(3) << "process a line"; std::vector columns; split(line, '\t', &columns); CHECK_EQ(columns.size(), 2UL) @@ -65,8 +63,8 @@ Record ProcessALine(const std::string& line) { for (auto& s : shape_strs) { record.shape.push_back(std::stoi(s)); } - LOG(INFO) << "data size " << record.data.size(); - LOG(INFO) << "data shape size " << record.shape.size(); + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); return record; } @@ -78,20 +76,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { file.close(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - LOG(INFO) << "predictor output numel " << numel; - LOG(INFO) << "reference output numel " << refer.data.size(); - EXPECT_EQ(numel, refer.data.size()); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); + PADDLE_ENFORCE_EQ(numel, refer.data.size()); switch (output.dtype) { case PaddleDType::INT64: { for (size_t i = 0; i < numel; ++i) { - EXPECT_EQ(static_cast(output.data.data())[i], refer.data[i]); + PADDLE_ENFORCE_EQ(static_cast(output.data.data())[i], + refer.data[i]); } break; } case PaddleDType::FLOAT32: for (size_t i = 0; i < numel; ++i) { - EXPECT_NEAR( - static_cast(output.data.data())[i], refer.data[i], 1e-5); + PADDLE_ENFORCE_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); } break; } @@ -106,15 +106,15 @@ void Main(bool use_gpu) { config.prog_file = FLAGS_modeldir + "/__model__"; config.use_gpu = use_gpu; config.device = 0; -#ifdef PADDLE_WITH_CUDA - config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; -#endif + if (FLAGS_use_gpu) { + config.fraction_of_gpu_memory = 0.1; // set by yourself + } - LOG(INFO) << "init predictor"; + VLOG(3) << "init predictor"; auto predictor = CreatePaddlePredictor(config); - LOG(INFO) << "begin to process data"; + VLOG(3) << "begin to process data"; // Just a single batch of data. std::string line; std::ifstream file(FLAGS_data); @@ -123,27 +123,32 @@ void Main(bool use_gpu) { file.close(); // Inference. 
- PaddleTensor input{ - .name = "xx", - .shape = record.shape, - .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; - LOG(INFO) << "run executor"; + VLOG(3) << "run executor"; std::vector output; predictor->Run({input}, &output); - LOG(INFO) << "output.size " << output.size(); + VLOG(3) << "output.size " << output.size(); auto& tensor = output.front(); - LOG(INFO) << "output: " << SummaryTensor(tensor); + VLOG(3) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); } -TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); } -#ifdef PADDLE_WITH_CUDA -TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); } -#endif } // namespace demo } // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(false /* use_gpu*/); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); + } + return 0; +} diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cc491e10d691a206dd903b78c0ea570741da44c --- /dev/null +++ b/paddle/fluid/inference/api/helper.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/helper.h" + +namespace paddle { +namespace inference { + +template <> +std::string to_string>( + const std::vector> &vec) { + std::stringstream ss; + for (const auto &piece : vec) { + ss << to_string(piece) << "\n"; + } + return ss.str(); +} + +template <> +std::string to_string>>( + const std::vector>> &vec) { + std::stringstream ss; + for (const auto &line : vec) { + for (const auto &rcd : line) { + ss << to_string(rcd) << ";\t"; + } + ss << '\n'; + } + return ss.str(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h new file mode 100644 index 0000000000000000000000000000000000000000..8e359a67738c0df180933421b45f15b39fd0e78c --- /dev/null +++ b/paddle/fluid/inference/api/helper.h @@ -0,0 +1,139 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
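A short, hedged sketch of how the `to_string` overloads added in helper.cc might be exercised; the nested-vector contents are invented, and the snippet assumes it is built inside the Paddle tree with helper.cc linked in so the new header below resolves.
```
#include <vector>
#include <glog/logging.h>
#include "paddle/fluid/inference/api/helper.h"

int main() {
  // A vector of vectors picks the to_string<std::vector<float>> specialization
  // defined in helper.cc, printing one inner vector per line.
  std::vector<std::vector<float>> lod = {{0.f, 1.f, 2.f}, {3.f, 4.f}};
  LOG(INFO) << paddle::inference::to_string(lod);
  return 0;
}
```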
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/timer.h" + +namespace paddle { +namespace inference { + +static void split(const std::string &str, char sep, + std::vector *pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} +static void split_to_float(const std::string &str, char sep, + std::vector *fs) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), + [](const std::string &v) { return std::stof(v); }); +} +static void split_to_int64(const std::string &str, char sep, + std::vector *is) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), + [](const std::string &v) { return std::stoi(v); }); +} +template +std::string to_string(const std::vector &vec) { + std::stringstream ss; + for (const auto &c : vec) { + ss << c << " "; + } + return ss.str(); +} +template <> +std::string to_string>( + const std::vector> &vec); + +template <> +std::string to_string>>( + const std::vector>> &vec); + +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data) { + // Assign buffer + int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, + [](int a, int b) { return a * b; }); + tensor->data.Resize(sizeof(T) * dim); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + static_cast(tensor->data.data())[c++] = v; + } + } +} + +std::string DescribeTensor(const PaddleTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name << "]\n"; + os << " - type: "; + switch (tensor.dtype) { + case PaddleDType::FLOAT32: + os << "float32"; + break; + case PaddleDType::INT64: + os << "int64"; + break; + default: + os << "unset"; + } + os << '\n'; + + os << " - shape: " << to_string(tensor.shape) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod) { + os << to_string(l) << "; "; + } + os << "\n"; + os << " - data: "; + + int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, + [](int a, int b) { return a * b; }); + for (int i = 0; i < dim; i++) { + os << static_cast(tensor.data.data())[i] << " "; + } + os << '\n'; + return os.str(); +} + +void PrintTime(int batch_size, int repeat, int num_threads, int tid, + double latency, int epoch = 1) { + LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat + << ", threads: " << num_threads << ", thread id: " << tid + << ", latency: " << latency << "ms ======"; + if (epoch > 1) { + int samples = batch_size * epoch; + LOG(INFO) << "====== sample number: " << samples + << ", average latency of each sample: " << latency / samples + << "ms ======"; + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/fluid/inference/api/high_level_api.md similarity index 98% rename from paddle/contrib/inference/high_level_api.md rename to paddle/fluid/inference/api/high_level_api.md index eb92885052a453d8c837bbf6f6e984efb509332a..8b8b6916d7e2b1a2f9fd09e9dfd2fe5a332461f5 100644 --- a/paddle/contrib/inference/high_level_api.md +++ 
b/paddle/fluid/inference/api/high_level_api.md @@ -57,4 +57,4 @@ By specifying the engine kind and config, one can get a specific implementation. ## Reference - [paddle_inference_api.h](./paddle_inference_api.h) -- [some demos](./demo) +- [some demos](./demo_ci) diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md similarity index 89% rename from paddle/contrib/inference/high_level_api_cn.md rename to paddle/fluid/inference/api/high_level_api_cn.md index a57f015a4e44d43ee4e475cf606faa6f05e095fa..442c598978c700f4c438b365b8900db5b65bc5ec 100644 --- a/paddle/contrib/inference/high_level_api_cn.md +++ b/paddle/fluid/inference/api/high_level_api_cn.md @@ -65,13 +65,13 @@ config.model_dir = "xxx"; config.use_gpu = false; // 创建一个原生的 PaddlePredictor auto predictor = - paddle::CreatePaddlePredictor(config); + paddle::CreatePaddlePredictor(config); // 创建输入 tensor int64_t data[4] = {1, 2, 3, 4}; paddle::PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + .data = paddle::PaddleBuf(data, sizeof(data)), + .dtype = paddle::PaddleDType::INT64}; // 创建输出 tensor,输出 tensor 的内存可以复用 std::vector outputs; // 执行预测 @@ -83,5 +83,5 @@ CHECK(predictor->Run(slots, &outputs)); ## 详细代码参考 -- [inference demos](./demo) -- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc) +- [inference demos](./demo_ci) +- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc) diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h similarity index 75% rename from paddle/contrib/inference/paddle_inference_api.h rename to paddle/fluid/inference/api/paddle_inference_api.h index b8ba2d14a5c161d491d838888ea14b776f769f23..55a07ca705f9fafa9ea223a867300bd14e10c364 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -40,11 +40,12 @@ class PaddleBuf { // Copy only available when memory is managed externally. explicit PaddleBuf(const PaddleBuf&); PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); // Do not own the memory. PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -67,15 +68,16 @@ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. std::vector shape; - // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; + std::vector> lod; // Tensor+LoD equals LoDTensor }; enum class PaddleEngineKind { kNative = 0, // Use the native Fluid facility. kAnakin, // Use Anakin for inference. kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. + kAnalysis // TODO(Superjomn) support following engines latter. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. @@ -98,7 +100,8 @@ class PaddlePredictor { // responsible for the output tensor's buffer, either allocated or passed from // outside. 
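// Illustrative call site only (not part of this header); `predictor` and
// `inputs` are assumed to come from CreatePaddlePredictor and the caller:
//   std::vector<PaddleTensor> outputs;
//   CHECK(predictor->Run(inputs, &outputs, /*batch_size=*/1));
// The trailing batch_size argument added below is optional and defaults to -1.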
virtual bool Run(const std::vector& inputs, - std::vector* output_data) = 0; + std::vector* output_data, + int batch_size = -1) = 0; // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. @@ -118,6 +121,10 @@ struct NativeConfig : public PaddlePredictor::Config { bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. + // NOTE: NOT use it, just for the internal test, will discard later + bool _use_mkldnn{false}; + // Specify the variable's name of each input. + bool specify_input_name{false}; std::string prog_file; std::string param_file; @@ -125,14 +132,39 @@ struct NativeConfig : public PaddlePredictor::Config { // Configurations for Anakin engine. struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; int device; std::string model_file; int max_batch_size{-1}; + TargetType target_type; }; struct TensorRTConfig : public NativeConfig { // Determine whether a subgraph will be executed by TRT. int min_subgraph_size{1}; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. + int max_batch_size{1}; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting + int workspace_size{1 << 30}; +}; + +// NOTE WIP, not stable yet. +struct AnalysisConfig : public NativeConfig { + // + enum class IrPassMode { + kSystem, // Use system default passes, not customize. + kInclude, // Specify the passes in `ir_passes`. + kExclude // Specify the disabled passes in `ir_passes`. + }; + + bool enable_ir_optim = true; + IrPassMode ir_mode{IrPassMode::kExclude}; + // attention lstm fuse works only on some specific models, disable as default. + std::vector ir_passes{"attention_lstm_fuse_pass"}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/api/timer.h b/paddle/fluid/inference/api/timer.h new file mode 100644 index 0000000000000000000000000000000000000000..2df5274dc1f2e7ad8e434f1da9d5ae6aee94c784 --- /dev/null +++ b/paddle/fluid/inference/api/timer.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
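A minimal sketch of filling in the TensorRT-related knobs introduced above, loosely following the subgraph test earlier in this patch. The helper name `MakeTrtConfig` and the numeric values are invented for illustration; the finished config would then be handed to the templated `CreatePaddlePredictor` factory declared in the same header.
```
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::TensorRTConfig MakeTrtConfig(const std::string& model_dir) {
  paddle::TensorRTConfig config;
  config.model_dir = model_dir;
  config.use_gpu = true;
  config.fraction_of_gpu_memory = 0.3;  // same knob the subgraph test sets
  config.device = 0;
  config.max_batch_size = 10;       // best kept equal to the runtime batch size
  config.workspace_size = 1 << 30;  // see the TensorRT troubleshooting guide
  config.min_subgraph_size = 1;     // smaller subgraphs stay on the Fluid side
  return config;
}
```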
+#pragma once + +#include // NOLINT + +namespace paddle { +namespace inference { + +// Timer for timer +class Timer { + public: + std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point startu; + + void tic() { start = std::chrono::high_resolution_clock::now(); } + double toc() { + startu = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_span = + std::chrono::duration_cast>(startu - + start); + double used_time_ms = static_cast(time_span.count()) * 1000.0; + return used_time_ms; + } +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh new file mode 100755 index 0000000000000000000000000000000000000000..12b7b3e7e5982f193e48596b867953fc93841b61 --- /dev/null +++ b/paddle/fluid/inference/check_symbol.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +lib=$1 +if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi + +num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l) +num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l) + +if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi +if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi + +exit 0 diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 181868977dd8f2568486ed0c4e1f260a69795896..e246a06fd079d837ac321197914c9f70b528f2c8 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" @@ -124,6 +125,9 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, dirname, ""); return main_program; @@ -138,10 +142,29 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; } +void SaveVars(const framework::Scope& scope, + const std::vector& vars, const std::string& dirname, + bool predicate) { + framework::ProgramDesc prog; + auto* block = prog.MutableBlock(0); + auto* op = block->AppendOp(); + op->SetType("save_combine"); + op->SetInput("X", vars); + op->SetAttr("file_path", dirname + "/param"); + op->CheckAttrs(); + + platform::CPUPlace place; + framework::Executor exe(place); + exe.Run(prog, const_cast(&scope), 0, true, true); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 01b50b3670cb9da2e0be232a61ea6129dd83aa20..ab492577c1476abee30d6dd1c740394391e5a93a 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -41,5 +41,10 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& prog_filename, const std::string& param_filename); +// Save the variables from a scope to disk. 
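// Illustrative call only; the variable names and output directory are made up:
//   paddle::inference::SaveVars(scope, {"fc_0.w_0", "fc_0.b_0"}, "./saved_params");
// As implemented in io.cc above, the listed variables are written by a single
// save_combine op into "<dirname>/param".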
+void SaveVars(const framework::Scope& scope, + const std::vector& vars, const std::string& dirname, + bool predicate = true); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..7e5cae04b81e6ce759b92f6c4b921ecf974e8260 100644 --- a/paddle/fluid/inference/paddle_fluid.map +++ b/paddle/fluid/inference/paddle_fluid.map @@ -1,6 +1,7 @@ { global: *paddle*; + *Pass*; local: *; }; diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym new file mode 100644 index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374 --- /dev/null +++ b/paddle/fluid/inference/paddle_fluid.sym @@ -0,0 +1 @@ +*paddle* diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index b52d083f280e5e7713600a7b748dedd37aca0a1e..a610687a5b11999a7cb7426dbe961e5972ee1746 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto) +nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 748f5a084e8c880df215a60fe51c835ba5cd3110..9d7be2d03cf7bb12afe7e52d9630f184d689dc25 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,8 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc - DEPS tensorrt_engine mul_op) + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc +batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc + DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) @@ -13,3 +14,16 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) +nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) +nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) +nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) +nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) +nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) + +nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 
e1cace9cc1b06f036f52e82b7b86c99a02d50f50..8168cdff1b85fc05d22fbec7fac6ab8892f3a907 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter { engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("relu (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3330af2da6c97ad153dcecd86be4b441eac62b5e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class BatchNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1); // Bias is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1); // Mean is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1); // Scale is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Variance").size(), + 1); // Variance is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + // Declare weights + auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* Mean_v = scope.FindVar(op_desc.Input("Mean").front()); + auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); + auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front()); + const float eps = boost::get(op_desc.GetAttr("epsilon")); + + PADDLE_ENFORCE_NOT_NULL(Bias_v); + PADDLE_ENFORCE_NOT_NULL(Mean_v); + PADDLE_ENFORCE_NOT_NULL(Scale_v); + PADDLE_ENFORCE_NOT_NULL(Variance_v); + + // get tensor + auto* Bias_t = Bias_v->GetMutable(); + auto* Mean_t = Mean_v->GetMutable(); + auto* Scale_t = Scale_v->GetMutable(); + auto* Variance_t = Variance_v->GetMutable(); + + // create temp tensor for weights + framework::LoDTensor bias_tensor; + framework::LoDTensor mean_tensor; + framework::LoDTensor scale_tensor; + framework::LoDTensor variance_tensor; + + bias_tensor.Resize(Bias_t->dims()); + mean_tensor.Resize(Mean_t->dims()); + scale_tensor.Resize(Scale_t->dims()); + 
variance_tensor.Resize(Variance_t->dims()); + + platform::CPUPlace cpu_place; + // copy data from gpu to cpu + TensorCopySync((*Bias_t), cpu_place, &bias_tensor); + TensorCopySync((*Mean_t), cpu_place, &mean_tensor); + TensorCopySync((*Scale_t), cpu_place, &scale_tensor); + TensorCopySync((*Variance_t), cpu_place, &variance_tensor); + + auto* bias_data = bias_tensor.mutable_data(platform::CPUPlace()); + auto* mean_data = mean_tensor.mutable_data(platform::CPUPlace()); + auto* scale_data = scale_tensor.mutable_data(platform::CPUPlace()); + auto* variance_data = + variance_tensor.mutable_data(platform::CPUPlace()); + + std::unique_ptr combile_scale_tensor( + new framework::LoDTensor()); + std::unique_ptr combile_bias_tensor( + new framework::LoDTensor()); + + combile_scale_tensor->Resize(scale_tensor.dims()); + combile_bias_tensor->Resize(bias_tensor.dims()); + + auto* combile_scale_data = + combile_scale_tensor->mutable_data(platform::CPUPlace()); + auto* combile_bias_data = + combile_bias_tensor->mutable_data(platform::CPUPlace()); + + size_t ele_num = combile_scale_tensor->memory_size() / sizeof(float); + + for (size_t i = 0; i < ele_num; i++) { + float scale = scale_data[i]; + float bias = bias_data[i]; + float mean = mean_data[i]; + float variance = variance_data[i]; + combile_scale_data[i] = scale / sqrtf(variance + eps); + combile_bias_data[i] = bias - mean * combile_scale_data[i]; + } + + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, static_cast(combile_scale_data), + combile_scale_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(combile_bias_data), + combile_bias_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), + nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); + + auto output_name = op_desc.Output("Y").front(); + layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->weight_map[op_desc.Input("Bias").front()] = + std::move(combile_bias_tensor); + engine_->weight_map[op_desc.Input("Scale").front()] = + std::move(combile_scale_tensor); + + engine_->SetITensor(output_name, layer->getOutput(0)); + + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(batch_norm, BatchNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a11dfa1e8f2dacfad067d025678911200db500fb --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. + */ +class ConcatOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + std::vector itensors; + for (auto& input_name : op_desc.Input("X")) { + itensors.push_back(engine_->GetITensor(input_name)); + } + int axis = boost::get(op_desc.GetAttr("axis")); + PADDLE_ENFORCE(axis > 0, + "The axis attr of Concat op should be large than 0 for trt"); + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(), + itensors.size()); + axis = axis - 1; // Remove batch dim + layer->setAxis(axis); + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("concat (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99..0a37d3968c39d2c244bbd82161afddf6330e421d 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -20,11 +20,72 @@ namespace tensorrt { class Conv2dOpConverter : public OpConverter { public: - Conv2dOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("Input").front()); + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + const int n_output = weight_tensor->dims()[0]; + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; + + const int groups = boost::get(op_desc.GetAttr("groups")); + const std::vector dilations = + boost::get>(op_desc.GetAttr("dilations")); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + nvinfer1::DimsHW nv_ksize(filter_h, filter_w); + nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW 
nv_paddings(paddings[0], paddings[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; + + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *const_cast(X), n_output, + nv_ksize, weight.get(), bias.get()); + PADDLE_ENFORCE(layer != nullptr); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + layer->setDilation(nv_dilations); + layer->setNbGroups(groups); + + auto output_name = op_desc.Output("Output").front(); + layer->setName(("conv2d (Output: " + output_name + ")").c_str()); + engine_->weight_map[op_desc.Input("Filter").front()] = + std::move(weight_tensor); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a6ce568f194f03c7259e1ebf28dd6ce4df2d594 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -0,0 +1,222 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ElementwiseWeightOpConverter : public OpConverter { + public: + ElementwiseWeightOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+ framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + PADDLE_ENFORCE(dims_x.nbDims >= 3); + + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + + std::vector dims_y = framework::vectorize2int(weight_tensor->dims()); + if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { + if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); + } + + if (static_cast(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (static_cast(dims_y.size()) == dims_x.nbDims && + dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != dims_x.d[i]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + break; + } + } + if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != 1) + PADDLE_THROW( + "TensorRT unsupported weight shape for Elementwise op!"); + } + } + } else { + PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); + } + + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *const_cast(X), scale_mode, + shift_weights.get(), scale_weights.get(), power_weights.get()); + auto output_name = op_desc.Output("Out")[0]; + + layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +class ElementwiseTensorOpConverter : public OpConverter { + public: + ElementwiseTensorOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+ framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + // The two input tensor should have the same dims + PADDLE_ENFORCE(dims_x.nbDims >= 3); + if (dims_x.nbDims == dims_y.nbDims) { + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + } else { + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + + auto op_pair = ops.find(op_type_); + if (op_pair == ops.end()) { + PADDLE_THROW("Wrong elementwise op type!"); + } + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); + + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } + + protected: + static const std::unordered_map + ops; + std::string op_type_; +}; + +const std::unordered_map + ElementwiseTensorOpConverter::ops = { + {"add", nvinfer1::ElementWiseOperation::kSUM}, + {"mul", nvinfer1::ElementWiseOperation::kPROD}, + {"sub", nvinfer1::ElementWiseOperation::kSUB}, + {"div", nvinfer1::ElementWiseOperation::kDIV}, + {"min", nvinfer1::ElementWiseOperation::kMIN}, + {"pow", nvinfer1::ElementWiseOperation::kPOW}, + {"max", nvinfer1::ElementWiseOperation::kMAX}, +}; + +class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMulOpConverter() { op_type_ = "mul"; } +}; + +class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorSubOpConverter() { op_type_ = "sub"; } +}; + +class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorDivOpConverter() { op_type_ = "div"; } +}; + +class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMinOpConverter() { op_type_ = "min"; } +}; + +class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMaxOpConverter() { op_type_ = "max"; } +}; + +class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorPowOpConverter() { op_type_ = "pow"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); + +REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, + ElementwiseTensorAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor, + ElementwiseTensorSubOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor, + ElementwiseTensorDivOpConverter); 
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor, + ElementwiseTensorMulOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor, + ElementwiseTensorMaxOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, + ElementwiseTensorMinOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, + ElementwiseTensorPowOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index bb603efaf30bb72d74b5583abc45d01a16c076a3..7c21ecd95da07b498eed2ab1bbdcc0e8cd184787 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -12,12 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace inference { @@ -32,13 +27,13 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, for (int h = 0; h < shape.h(); ++h) { for (int w = 0; w < shape.w(); ++w) { odata[h * ostrides.h() + w * ostrides.w()] = - idata[h * ostrides.h() + w * ostrides.w()]; + idata[h * istrides.h() + w * istrides.w()]; } } } - +// indata c * k // Reorder the data layout from CK to KC. -void ReorderCKtoKC(TensorRTEngine::Weight& iweights, +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT TensorRTEngine::Weight* oweights) { int c = iweights.dims[0]; int k = iweights.dims[1]; @@ -73,27 +68,33 @@ class FcOpConverter : public OpConverter { auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be // assigned from CPU memory, that can't be avoided. - auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix - size_t n_output = Y_t->dims()[1]; + platform::CPUPlace cpu_place; + framework::LoDTensor weight_tensor; + weight_tensor.Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, &weight_tensor); + + auto* weight_data = weight_tensor.mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix + size_t n_output = weight_tensor.dims()[1]; - framework::LoDTensor tmp; - tmp.Resize(Y_t->dims()); - memcpy(tmp.mutable_data(platform::CPUPlace()), Y_t->data(), - Y_t->dims()[0] * Y_t->dims()[1]); + std::unique_ptr tmp(new framework::LoDTensor()); + tmp->Resize(weight_tensor.dims()); + memcpy(tmp->mutable_data(platform::CPUPlace()), weight_data, + Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), Y_t->memory_size() / sizeof(float)}; TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, - static_cast(tmp.data()), + static_cast(tmp->data()), Y_t->memory_size() / sizeof(float)); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); tmp_weight.dims = weight.dims; // The data layout of TRT FC layer's weight is different from fluid's FC, // need to reorder the elements. 
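// (Illustrative example with invented values: for C = 3 inputs and K = 2
//  outputs, the row-major CK buffer {w00, w01, w10, w11, w20, w21} becomes
//  the KC buffer {w00, w10, w20, w01, w11, w21} after ReorderCKtoKC.)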
- ReorderCKtoKC(tmp_weight, &weight); + ReorderCKtoKC(weight, &tmp_weight); // Currently, the framework can only handle one fluid op -> one TRT layer, // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just @@ -103,10 +104,13 @@ class FcOpConverter : public OpConverter { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *const_cast(X), - n_output, weight.get(), bias.get()); + n_output, tmp_weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); + layer->setName(("fc (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); + engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { engine_->DeclareOutput(output_name); } @@ -118,4 +122,3 @@ class FcOpConverter : public OpConverter { } // namespace paddle REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); -USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 3c342957360ad4192d838147bf37e84d233c2629..514eb659a8da73b6e56b5d17148ec0cb2aeaa135 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(mul); REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 6697952051c4b1997ca6b550da17a52e64cb3454..d309d94c560f2b484fac6b6cd40cc2704d641069 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -55,6 +55,40 @@ class OpConverter { it = Registry::Lookup("fc"); } } + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + // TODO(xingzhaolong): all mul, sub, div + // static std::unordered_set add_weight_op_set {"add", "mul", + // "sub", "div"}; + static std::unordered_set add_weight_op_set{"add"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + int op_type_len = op_desc.Type().size(); + std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_weight"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } else { + PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_tensor"); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } + + if (op_desc.Type() == "depthwise_conv2d") { + it = Registry::Lookup("conv2d"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } + if (!it) { it = Registry::Lookup(op_desc.Type()); } @@ -93,6 +127,10 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; +} // namespace tensorrt +} // namespace inference +} // namespace paddle + #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ trt_##op_type__##_converter() { \ @@ -111,7 +149,3 @@ class OpConverter { extern int 
TouchConverterRegister_##op_type__(); \ static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ TouchConverterRegister_##op_type__(); - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9bb66a6e9f81a10368db7710108c319860e940a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights. + */ +class Pool2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + + bool global_pooling = boost::get(op_desc.GetAttr("global_pooling")); + std::string pool_type = + boost::get(op_desc.GetAttr("pooling_type")); + std::vector ksize = + boost::get>(op_desc.GetAttr("ksize")); + std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + if (global_pooling == true) { + nvinfer1::Dims input_shape = input1->getDimensions(); + int nbDims = input_shape.nbDims; + nv_ksize.d[0] = input_shape.d[nbDims - 2]; + nv_ksize.d[1] = input_shape.d[nbDims - 1]; + } + const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); + + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; + if (pool_type == "max") { + nv_pool_type = nvinfer1::PoolingType::kMAX; + } else if (pool_type == "avg") { + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; + } else { + PADDLE_THROW("TensorRT unsupported pooling type!"); + } + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, + *const_cast(input1), + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace 
paddle + +USE_OP(pool2d); +REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0064f90fd7944403c14d4d47616ea82f681ceb74 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights. + */ +class SoftMaxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid softmax op to tensorrt softmax layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax, + *const_cast(input1)); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); +REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 0a02a7bebf9efbd0555707e6cfa701ef1e7d9659..e82762ea03ecd00bce7cfb83b130a3436ccbfed3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(10); + validator.Execute(5); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..41412cb079540da72760558379b158b6538aa6a8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(batch_norm_op, test) { + std::unordered_set parameters( + {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", + "batch_norm_variance"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + std::vector param_shape{2}; + + validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclParamVar("batch_norm_scale", param_shape); + validator.DeclParamVar("batch_norm_bias", param_shape); + validator.DeclParamVar("batch_norm_mean", param_shape); + validator.DeclParamVar("batch_norm_variance", param_shape); + validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclOutputVar("batch_norm_save_mean", param_shape); + validator.DeclOutputVar("batch_norm_save_variance", param_shape); + + // Prepare Op description + framework::OpDesc desc; + + desc.SetType("batch_norm"); + desc.SetInput("X", {"batch_norm_X"}); + desc.SetInput("Scale", {"batch_norm_scale"}); + desc.SetInput("Bias", {"batch_norm_bias"}); + desc.SetInput("Mean", {"batch_norm_mean"}); + desc.SetInput("Variance", {"batch_norm_variance"}); + desc.SetOutput("Y", {"batch_norm_Y"}); + desc.SetOutput("MeanOut", {"batch_norm_mean"}); + desc.SetOutput("VarianceOut", {"batch_norm_variance"}); + desc.SetOutput("SavedMean", {"batch_norm_save_mean"}); + desc.SetOutput("SavedVariance", {"batch_norm_save_variance"}); + + float eps = 1e-5f; + bool is_test = true; + desc.SetAttr("epsilon", eps); + desc.SetAttr("is_test", is_test); + + validator.SetOp(*desc.Proto()); + + std::unordered_set neglected_output = { + "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean", + "batch_norm_variance"}; + validator.Execute(3, neglected_output); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(batch_norm); diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f284a4db5758e072915d7fd0f16115b8a36ba8b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(concat_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1)); + validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1)); + validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1)); + validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(5); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(concat); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8711c6b60d74639529624c25429bc245de46479 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(conv2d_op, test) { + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5)); + validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3)); + validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(conv2d); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7537d02a35b66a41c158cd8eb1b1e5d4107e7d84 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(elementwise_op, add_weight_test) { + std::unordered_set parameters({"elementwise_add-Y"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +TEST(elementwise_op, add_tensor_test) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(8, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + // the defalut axis of elementwise op is -1 + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(elementwise_add); diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index a30253072ac581ceca85ca10151a176f87a7cb39..1ae2668e733aad23241c63b9985e708396d0b1bc 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -23,11 +23,10 @@ namespace tensorrt { TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; - TRTConvertValidation validator(20, parameters, scope, 1000); - - validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1)); - validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2)); - validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2)); + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); + validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); + validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); // Prepare Op description framework::OpDesc desc; @@ -44,3 +43,4 @@ TEST(fc_op, test) { } // namespace tensorrt } // namespace inference } // namespace paddle +USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 1ce1130e5d660d717a1262a1fbdb4b620462c0b3..3d34cd7d5d0deca4d83a3f5b5ed0fb396c6acd56 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,7 +23,7 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - 
TRTConvertValidation validator(10, parameters, scope, 1000); + TRTConvertValidation validator(10, parameters, scope, 1000, false); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); @@ -39,7 +39,7 @@ TEST(MulOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(10); + validator.Execute(2); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 9b79f86b0edba983019bd932f52b08711ff36d41..01d7f700da9cc67d0ebbd3d9649e3823f58a8811 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -25,12 +25,43 @@ TEST(OpConverter, ConvertBlock) { framework::ProgramDesc prog; auto* block = prog.MutableBlock(0); auto* conv2d_op = block->AppendOp(); + + // init trt engine + cudaStream_t stream_; + std::unique_ptr engine_; + engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); + engine_->InitNetwork(); + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + + engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(2, 5, 5)); + conv2d_op->SetType("conv2d"); + conv2d_op->SetInput("Input", {"conv2d-X"}); + conv2d_op->SetInput("Filter", {"conv2d-Y"}); + conv2d_op->SetOutput("Output", {"conv2d-Out"}); - OpConverter converter; + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + conv2d_op->SetAttr("strides", strides); + conv2d_op->SetAttr("paddings", paddings); + conv2d_op->SetAttr("dilations", dilations); + conv2d_op->SetAttr("groups", groups); + + // init scope framework::Scope scope; - converter.ConvertBlock(*block->Proto(), {}, scope, - nullptr /*TensorRTEngine*/); + std::vector dim_vec = {3, 2, 3, 3}; + auto* x = scope.Var("conv2d-Y"); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + x_tensor->mutable_data(platform::CUDAPlace(0)); + + OpConverter converter; + converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, + engine_.get() /*TensorRTEngine*/); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..aedd6b62df040eeee4e48f628128511cd8bf4439 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void test_pool2d(bool global_pooling) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. + validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); + if (global_pooling) + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1)); + else + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d-X"}); + desc.SetOutput("Out", {"pool2d-Out"}); + + std::vector ksize({2, 2}); + std::vector strides({2, 2}); + std::vector paddings({0, 0}); + std::string pooling_t = "max"; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +TEST(Pool2dOpConverter, normal) { test_pool2d(false); } + +TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..503ce71f7fb4377bb4304569b7484fb25abdb284 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(SoftMaxOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(8, parameters, scope, 1000); + + std::vector tensor_shape{8, 10}; + validator.DeclInputVar("softmax-X", tensor_shape, + nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("softmax"); + desc.SetInput("X", {"softmax-X"}); + desc.SetOutput("Out", {"softmax-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 3b1f531adc5d756259df1c350f7f44bf71ee1f93..0a6f171fc40a838fd81d6a51aca0430d5526f188 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -39,7 +40,7 @@ namespace tensorrt { float random(float low, float high) { static std::random_device rd; static std::mt19937 mt(rd()); - std::uniform_real_distribution dist(1.0, 10.0); + std::uniform_real_distribution dist(low, high); return dist(mt); } @@ -48,10 +49,17 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, auto dims = tensor->dims(); size_t num_elements = analysis::AccuDims(dims, dims.size()); PADDLE_ENFORCE_GT(num_elements, 0); - auto* data = tensor->mutable_data(place); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(dims); + auto* temp_data = temp_tensor.mutable_data(cpu_place); + for (size_t i = 0; i < num_elements; i++) { - *(data + i) = random(0., 1.); + *(temp_data + i) = random(0., 1.); } + + TensorCopySync(temp_tensor, place, tensor); } /* @@ -62,46 +70,73 @@ class TRTConvertValidation { public: TRTConvertValidation() = delete; - TRTConvertValidation(int batch_size, + TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10) - : parameters_(parameters), scope_(scope) { + int workspace_size = 1 << 10, bool if_add_batch = true) + : parameters_(parameters), + scope_(scope), + if_add_batch_(if_add_batch), + max_batch_size_(max_batch_size) { // create engine. - engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); } // Declare a Variable as input with random initialization. 
+ void DeclInputVar(const std::string& name, const std::vector tensor_dims, + const nvinfer1::Dims& trt_dims) { + DeclVar(name, tensor_dims); + engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims); + } + void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims); // Declare TRT inputs. engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims); } + void DeclParamVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + // Declare a parameter varaible in the scope. void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { - DeclVar(name, dims); + DeclVar(name, dims, true); + } + + void DeclOutputVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); } void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims); } - // Declare a variable in a fluid Scope. - void DeclVar(const std::string& name, const nvinfer1::Dims& dims) { - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + void DeclVar(const std::string& name, const std::vector dim_vec) { + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); - // Init Fluid tensor. - std::vector dim_vec(dims.d, dims.d + dims.nbDims); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); RandomizeTensor(x_tensor, place, ctx); } + // Declare a variable in a fluid Scope. + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { + // Init Fluid tensor. + std::vector dim_vec(dims.d, dims.d + dims.nbDims); + // There is no batchsize in ITensor's shape, but We should add it to + // tensor's shape of fluid. If the variable is not parameter and the + // if_add_batch_ flag is true, add the max batchsize to dim_vec. + if (is_param != true && if_add_batch_ == true) + dim_vec.insert(dim_vec.begin(), max_batch_size_); + + DeclVar(name, dim_vec); + } void SetOp(const framework::proto::OpDesc& desc) { op_ = framework::OpRegistry::CreateOp(desc); @@ -121,37 +156,48 @@ class TRTConvertValidation { PADDLE_ENFORCE(var); auto tensor = var->GetMutable(); - engine_->SetInputFromCPU( + engine_->SetInputFromGPU( input, static_cast(tensor->data()), sizeof(float) * analysis::AccuDims(tensor->dims(), tensor->dims().size())); } } - void Execute(int batch_size) { + // We use the set 'neglected_output' here, because some Ops like batch norm, + // the outputs specified in the op des are only used during training, + // so we should neglect those output during inference. + void Execute(int batch_size, + std::unordered_set neglected_output = {}) { // Execute Fluid Op - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + PADDLE_ENFORCE_LE(batch_size, max_batch_size_); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); op_->Run(scope_, place); // Execute TRT. 
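// Note on the steps below (descriptive only, not added by this patch): the
// engine is run with the requested batch_size, the TRT output is copied back
// to the host, and it is compared element-wise against the fluid output with
// a tolerance of 2e-5, skipping any name in neglected_output. When
// if_add_batch_ is true the fluid tensor was declared with max_batch_size_
// prepended to its shape, so only the first batch_size samples are compared:
//   compared_elements = batch_size * (numel(fluid_out) / max_batch_size_).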
engine_->Execute(batch_size); cudaStreamSynchronize(*engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 200; + const size_t output_space_size = 3000; for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; std::vector fluid_out; std::vector trt_out(output_space_size); - engine_->GetOutputInCPU(output, &trt_out[0], - output_space_size * sizeof(float)); + engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); cudaStreamSynchronize(*engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + + size_t fluid_out_size = fluid_out.size(); + if (if_add_batch_ == true) { + fluid_out_size = + batch_size * (framework::product(tensor->dims()) / max_batch_size_); + } // Compare two output ASSERT_FALSE(fluid_out.empty()); - for (size_t i = 0; i < fluid_out.size(); i++) { + for (size_t i = 0; i < fluid_out_size; i++) { // Loose the threshold for CI in different machine model. EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } @@ -167,6 +213,12 @@ class TRTConvertValidation { std::unique_ptr op_desc_; const std::unordered_set& parameters_; framework::Scope& scope_; + // The ITensor of trt does not cotain the batch size, + // bug, in most cases, we need to set batch size for + // fluid's tensor shape. This variable indicates + // whether to add batch size to tensor shape of fluid. + bool if_add_batch_; + int max_batch_size_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a..14e9e14d33d637ee68e37593cc48721e5169499f 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -1,7 +1,7 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -26,26 +26,32 @@ namespace paddle { namespace inference { namespace tensorrt { -void TensorRTEngine::Build(const DescType& paddle_model) { +int TensorRTEngine::runtime_batch_ = 1; + +void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } void TensorRTEngine::Execute(int batch_size) { - std::vector buffers; - for (auto& buf : buffers_) { + freshDeviceId(); + batch_size_ = batch_size; + std::vector buffers; + for (auto &buf : buffers_) { PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); PADDLE_ENFORCE_GT(buf.max_size, 0); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } + PADDLE_ENFORCE_NOT_NULL(stream_); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); cudaStreamSynchronize(*stream_); + SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { cudaStreamSynchronize(*stream_); // clean buffer - for (auto& buf : buffers_) { + for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); buf.buffer = nullptr; @@ -55,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() { } void TensorRTEngine::FreezeNetwork() { + freshDeviceId(); PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); PADDLE_ENFORCE(infer_network_ != nullptr, @@ -70,46 +77,51 @@ void TensorRTEngine::FreezeNetwork() { // allocate GPU buffers. buffers_.resize(buffer_sizes_.size()); - for (auto& item : buffer_sizes_) { + for (auto &item : buffer_sizes_) { + // The output buffers are not set in the network building phrase, need to + // infer from the TesorRT network. if (item.second == 0) { auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); auto dims = infer_engine_->getBindingDimensions(slot_offset); item.second = kDataTypeSize[static_cast( infer_engine_->getBindingDataType(slot_offset))] * - analysis::AccuDims(dims.d, dims.nbDims); + analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; + PADDLE_ENFORCE_GT(item.second, 0); } - auto& buf = buffer(item.first); + + auto &buf = buffer(item.first); + buf.max_size = item.second * max_batch_; CHECK(buf.buffer == nullptr); // buffer should be allocated only once. 
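// ----------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the per-binding byte size
// computed above for output bindings once the engine is frozen -- the element
// count implied by the binding's dimensions, times the byte width of its data
// type, times the engine's maximum batch size. The function name and
// signature below are hypothetical.
#include <cstddef>

size_t OutputBindingBytes(const int* dims, int nb_dims, size_t dtype_bytes,
                          int max_batch) {
  size_t elements = 1;
  for (int i = 0; i < nb_dims; ++i) {
    elements *= static_cast<size_t>(dims[i]);
  }
  return elements * dtype_bytes * static_cast<size_t>(max_batch);
}
// ----------------------------------------------------------------------------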
- PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); - VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " - << buf.buffer; - buf.size = buf.max_size = item.second; + + PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_)); + buf.size = 0; + PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G buf.device = DeviceType::GPU; } } -nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, +nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::DataType dtype, - const nvinfer1::Dims& dims) { + const nvinfer1::Dims &dims) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s", name); PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first"); - auto* input = infer_network_->addInput(name.c_str(), dtype, dims); + auto *input = infer_network_->addInput(name.c_str(), dtype, dims); PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * - analysis::AccuDims(dims.d, dims.nbDims); + analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; PADDLE_ENFORCE(input->isNetworkInput()); TensorRTEngine::SetITensor(name, input); return input; } -void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, - const std::string& name) { +void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, + const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", name); - auto* output = layer->getOutput(offset); + auto *output = layer->getOutput(offset); SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); @@ -121,11 +133,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, buffer_sizes_[name] = 0; } -void TensorRTEngine::DeclareOutput(const std::string& name) { +void TensorRTEngine::DeclareOutput(const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", name); - auto* output = TensorRTEngine::GetITensor(name); + auto *output = TensorRTEngine::GetITensor(name); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); PADDLE_ENFORCE(!output->isNetworkInput()); @@ -135,38 +147,52 @@ void TensorRTEngine::DeclareOutput(const std::string& name) { buffer_sizes_[name] = 0; } -void* TensorRTEngine::GetOutputInGPU(const std::string& name) { +void *TensorRTEngine::GetOutputInGPU(const std::string &name) { return buffer(name).buffer; } -void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, size_t max_size) { // determine data size + auto *output = TensorRTEngine::GetITensor(name); + nvinfer1::Dims dims = output->getDimensions(); + auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); + size_t dst_size = dim_size * runtime_batch_ * + kDataTypeSize[static_cast(output->getType())]; + auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); + PADDLE_ENFORCE_LE(dst_size, it->second); + PADDLE_ENFORCE_GE(max_size, dst_size); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, + PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, cudaMemcpyDeviceToDevice, *stream_), 0); } -void TensorRTEngine::GetOutputInCPU(const 
std::string& name, void* dst, +void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, size_t max_size) { // determine data size + + auto *output = TensorRTEngine::GetITensor(name); + nvinfer1::Dims dims = output->getDimensions(); + auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); + size_t dst_size = dim_size * runtime_batch_ * + kDataTypeSize[static_cast(output->getType())]; auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); + PADDLE_ENFORCE_LE(dst_size, it->second); + PADDLE_ENFORCE_GE(max_size, dst_size); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second, + PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, cudaMemcpyDeviceToHost, *stream_)); } -Buffer& TensorRTEngine::buffer(const std::string& name) { +Buffer &TensorRTEngine::buffer(const std::string &name) { PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); @@ -174,19 +200,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) { return buffers_[slot_offset]; } -void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); + buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, cudaMemcpyHostToDevice, *stream_)); } -void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf = buffer(name); + buf.size = size; PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); @@ -194,19 +224,32 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, cudaMemcpyDeviceToDevice, *stream_)); } -void TensorRTEngine::SetITensor(const std::string& name, - nvinfer1::ITensor* tensor) { +void TensorRTEngine::SetITensor(const std::string &name, + nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", name); itensor_map_[name] = tensor; } -nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) { +nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); return itensor_map_[name]; } +void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { + runtime_batch_ = batch_size; +} + +int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } + +void TensorRTEngine::freshDeviceId() { + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_, count); + cudaSetDevice(device_); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 
b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0..bd3ba4cea6551a7f6651e311e2649de191a6faa1 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -52,12 +53,16 @@ class TensorRTEngine : public EngineBase { }; TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, + cudaStream_t* stream = nullptr, int device = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger) {} + logger_(logger), + device_(device) { + freshDeviceId(); + cudaStreamCreate(stream_); + } virtual ~TensorRTEngine(); @@ -115,12 +120,28 @@ class TensorRTEngine : public EngineBase { nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } + void SetRuntimeBatch(size_t batch_size); + int GetRuntimeBatch(); + int GetDevice() { return device_; } + + // A pointer to CPU memory is needed of the TRT weight. + // Before TRT runs, fluid loads weight into GPU storage. + // so we need to copy the weights from GPU to CPU in our op converter. + // We use a map to store these weights for the weight memory is not released + // in advance, which affecting the construction of TRT Op. + std::unordered_map> + weight_map; private: // the max batch size int max_batch_; + // the runtime batch size + static int runtime_batch_; // the max memory size the engine uses int max_workspace_; + + // batch size of the current data, will be updated each Executation. + int batch_size_{-1}; cudaStream_t* stream_; // If stream_ is not set from outside, hold its own stream. cudaStream_t default_stream_; @@ -131,6 +152,8 @@ class TensorRTEngine : public EngineBase { std::unordered_map buffer_sizes_; std::unordered_map itensor_map_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; // TensorRT related internal members template @@ -147,6 +170,10 @@ class TensorRTEngine : public EngineBase { infer_ptr infer_network_; infer_ptr infer_engine_; infer_ptr infer_context_; + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). + void freshDeviceId(); }; // class TensorRTEngine // Add an layer__ into engine__ with args ARGS. 
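// Usage note (descriptive only, not part of this patch): the converters added
// in this change invoke the macro documented above as, for example,
//
//   auto* fc_layer   = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *X,
//                                           n_output, tmp_weight.get(),
//                                           bias.get());
//   auto* pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1,
//                                           nv_pool_type, nv_ksize);
//
// (simplified from fc_op.cc and pool2d_op.cc above). Conceptually it forwards
// to the corresponding builder call on the engine's INetworkDefinition,
// roughly network()->add<layer__>(ARGS), and returns the created layer.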
@@ -179,8 +206,8 @@ class TRT_EngineManager { // Create or get an engine called `name` TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream); + const std::string& name, int gpu_device = 0) { + auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); engines_[name].reset(p); return p; } diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index e635f0f87d577a1f1ac74687ee60f762be525418..da1f6535cb3b2476cd475797861d6d2bb6d88856 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,8 +27,8 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(1, 1 << 10, &stream_); + // ASSERT_EQ(0, cudaStreamCreate(&stream_)); + engine_ = new TensorRTEngine(10, 1 << 10, &stream_); engine_->InitNetwork(); } @@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) { LOG(INFO) << "to get output"; float y_cpu; - engine_->GetOutputInCPU("y", &y_cpu, sizeof(float)); + engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float)); LOG(INFO) << "to checkout output"; ASSERT_EQ(y_cpu, x_v * 2 + 3); @@ -103,11 +103,80 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { LOG(INFO) << "to get output"; float y_cpu[2] = {-1., -1.}; - engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); + + auto dims = engine_->GetITensor("y")->getDimensions(); + ASSERT_EQ(dims.nbDims, 3); + ASSERT_EQ(dims.d[0], 2); + ASSERT_EQ(dims.d[1], 1); + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); } +TEST_F(TensorRTEngineTest, test_conv2d) { + // Weight in CPU memory. + float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + float raw_bias[1] = {0}; + + TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); + TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); + auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 3, 3}); + auto* conv_layer = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, + weight.get(), bias.get()); + PADDLE_ENFORCE(conv_layer != nullptr); + conv_layer->setStride(nvinfer1::DimsHW{1, 1}); + conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); + + engine_->DeclareOutput(conv_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 18 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[18]; + engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float)); + ASSERT_EQ(y_cpu[0], 4.0); + ASSERT_EQ(y_cpu[1], 6.0); +} + +TEST_F(TensorRTEngineTest, test_pool2d) { + // Weight in CPU memory. 
+ auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 2, 2}); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; + auto* pool_layer = + TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), + pool_t, nvinfer1::DimsHW{2, 2}); + + PADDLE_ENFORCE(pool_layer != nullptr); + pool_layer->setStride(nvinfer1::DimsHW{1, 1}); + pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + + engine_->DeclareOutput(pool_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 8 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[2]; + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + + ASSERT_EQ(y_cpu[0], 2.0); + ASSERT_EQ(y_cpu[1], 5.0); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e397457061662c8afb9760ef52406c22caaeb213 --- /dev/null +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -0,0 +1,73 @@ +set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo") +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +function (inference_download_and_uncompress install_dir filename) + message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}") + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") +endfunction(inference_download_and_uncompress) + +function(download_model_and_data install_dir model_name data_name) + if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) + inference_download_and_uncompress(${install_dir} ${model_name}) + inference_download_and_uncompress(${install_dir} ${data_name}) + endif() +endfunction() + +# RNN1 +set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") +download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RNN1_INSTALL_DIR}/model + --infer_data=${RNN1_INSTALL_DIR}/data.txt) + +# RNN2 +set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") +download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RNN2_INSTALL_DIR}/model + --infer_data=${RNN2_INSTALL_DIR}/data.txt) + +# chinese_ner +set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") +inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model + --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) + +# lac +set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" 
"lac_data.txt.tar.gz") +inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${LAC_INSTALL_DIR}/model + --infer_data=${LAC_INSTALL_DIR}/data.txt) + +# text_classification +set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/model + --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt) + +# ocr +set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz") +set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr") +if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE) + get_filename_component(filename ${OCR_MODEL_URL} NAME) + message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") +endif() +inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model + --infer_data=${OCR_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf893e32569f4b50a583ab6f43cb214ec3620e09 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -0,0 +1,212 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct DataRecord { + std::vector data; + std::vector lod; + // for dataset and nextbatch + size_t batch_iter{0}; + std::vector> batched_lods; + std::vector> batched_datas; + std::vector> datasets; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) { + Load(path); + Prepare(batch_size); + batch_iter = 0; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + datasets.resize(0); + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ';', &data); + std::vector words_ids; + split_to_int64(data[1], ' ', &words_ids); + datasets.emplace_back(words_ids); + } + } + void Prepare(int bs) { + if (bs == 1) { + batched_datas = datasets; + for (auto one_sentence : datasets) { + batched_lods.push_back({0, one_sentence.size()}); + } + } else { + std::vector one_batch; + std::vector lod{0}; + int bs_id = 0; + for (auto one_sentence : datasets) { + bs_id++; + one_batch.insert(one_batch.end(), one_sentence.begin(), + one_sentence.end()); + lod.push_back(lod.back() + one_sentence.size()); + if (bs_id == bs) { + bs_id = 0; + batched_datas.push_back(one_batch); + batched_lods.push_back(lod); + one_batch.clear(); + one_batch.resize(0); + lod.clear(); + lod.resize(0); + lod.push_back(0); + } + } + if (one_batch.size() != 0) { + batched_datas.push_back(one_batch); + batched_lods.push_back(lod); + } + } + } + DataRecord NextBatch() { + DataRecord data; + data.data = batched_datas[batch_iter]; + data.lod = batched_lods[batch_iter]; + batch_iter++; + if (batch_iter >= batched_datas.size()) { + batch_iter = 0; + } + return data; + } +}; + +void GetOneBatch(std::vector *input_slots, DataRecord *data, + int batch_size) { + auto one_batch = data->NextBatch(); + PaddleTensor input_tensor; + input_tensor.name = "word"; + input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); + input_tensor.lod.assign({one_batch.lod}); + input_tensor.dtype = PaddleDType::INT64; + TensorAssignData(&input_tensor, {one_batch.data}); + PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); + input_slots->assign({input_tensor}); +} + +const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, + 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, + 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, + 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; + +void TestLACPrediction(const std::string &model_path, + const std::string &data_file, const int batch_size, + const int repeat, bool use_analysis = false) { + AnalysisConfig cfg; + cfg.model_dir = model_path; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + + std::vector input_slots, outputs_slots; + DataRecord data(data_file, batch_size); + GetOneBatch(&input_slots, &data, batch_size); + std::unique_ptr predictor; + if (use_analysis) { + predictor = + CreatePaddlePredictor(cfg); + } else { + predictor = + CreatePaddlePredictor(cfg); + } + for (int i = 0; i < FLAGS_burning; i++) { + predictor->Run(input_slots, &outputs_slots); + } + Timer timer; + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { + GetOneBatch(&input_slots, &data, batch_size); + input_slots_all.emplace_back(input_slots); + } + LOG(INFO) << "total number of samples: " << 
data.datasets.size(); + TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads); + return; + } + timer.tic(); + for (int i = 0; i < repeat; i++) { + predictor->Run(input_slots, &outputs_slots); + } + PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat); + + // check result + EXPECT_EQ(outputs_slots.size(), 1UL); + auto &out = outputs_slots[0]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); + PADDLE_ENFORCE_GT(size, 0); + EXPECT_GE(size, batch1_size); + int64_t *pdata = static_cast(out.data.data()); + for (size_t i = 0; i < batch1_size; ++i) { + EXPECT_EQ(pdata[i], lac_ref_data[i]); + } + + if (use_analysis) { + // run once for comparion as reference + auto ref_predictor = + CreatePaddlePredictor(cfg); + std::vector ref_outputs_slots; + ref_predictor->Run(input_slots, &ref_outputs_slots); + CompareResult(ref_outputs_slots, outputs_slots); + + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4); + EXPECT_EQ(num_ops, 11); + } +} + +TEST(Analyzer_LAC, native) { + LOG(INFO) << "LAC with native"; + TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, + FLAGS_repeat); +} + +TEST(Analyzer_LAC, analysis) { + LOG(INFO) << "LAC with analysis"; + TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, + FLAGS_repeat, true); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8c651e32f7e2ce1d8ced0e6774ffd555d351167 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> word_data_all, mention_data_all; + std::vector> rnn_word_datas, rnn_mention_datas; + std::vector lod; // two inputs have the same lod info. 
+ size_t batch_iter{0}; + size_t batch_size{1}; + size_t num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= word_data_all.size()) { + data.word_data_all.assign(word_data_all.begin() + batch_iter, + word_data_all.begin() + batch_end); + data.mention_data_all.assign(mention_data_all.begin() + batch_iter, + mention_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.word_data_all.empty()); + CHECK(!data.mention_data_all.empty()); + CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); + for (size_t j = 0; j < data.word_data_all.size(); j++) { + data.rnn_word_datas.push_back(data.word_data_all[j]); + data.rnn_mention_datas.push_back(data.mention_data_all[j]); + // calculate lod + data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ';', &data); + // load word data + std::vector word_data; + split_to_int64(data[1], ' ', &word_data); + // load mention data + std::vector mention_data; + split_to_int64(data[3], ' ', &mention_data); + word_data_all.push_back(std::move(word_data)); + mention_data_all.push_back(std::move(mention_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_word_tensor, lod_mention_tensor; + lod_word_tensor.name = "word"; + lod_mention_tensor.name = "mention"; + auto one_batch = data->NextBatch(); + int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size + lod_word_tensor.shape.assign({size, 1}); + lod_word_tensor.lod.assign({one_batch.lod}); + lod_mention_tensor.shape.assign({size, 1}); + lod_mention_tensor.lod.assign({one_batch.lod}); + // assign data + TensorAssignData(&lod_word_tensor, one_batch.rnn_word_datas); + TensorAssignData(&lod_mention_tensor, one_batch.rnn_mention_datas); + // Set inputs. 
+ input_slots->assign({lod_word_tensor, lod_mention_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +// the first inference result +const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, + 48, 39, 38, 16, 25}; + +void TestChineseNERPrediction(bool use_analysis) { + AnalysisConfig cfg; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.param_file = FLAGS_infer_model + "/param"; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + + std::vector input_slots, outputs; + std::unique_ptr predictor; + Timer timer; + if (use_analysis) { + predictor = + CreatePaddlePredictor(cfg); + } else { + predictor = + CreatePaddlePredictor(cfg); + } + + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); + } + LOG(INFO) << "total number of samples: " << data.num_samples; + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + return; + } + // Prepare inputs. + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat); + + PADDLE_ENFORCE(outputs.size(), 1UL); + auto &out = outputs[0]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_GT(size, 0); + int64_t *result = static_cast(out.data.data()); + for (size_t i = 0; i < std::min(11UL, size); i++) { + PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]); + } + + if (use_analysis) { + // run once for comparion as reference + auto ref_predictor = + CreatePaddlePredictor(cfg); + std::vector ref_outputs_slots; + ref_predictor->Run(input_slots, &ref_outputs_slots); + CompareResult(ref_outputs_slots, outputs); + + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2); + EXPECT_EQ(num_ops, 14); + } +} + +TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); } + +TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..df96be544eaf51c52aa5592966f499fad91aab82 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector> week_data_all, minute_data_all; + std::vector lod1, lod2, lod3; + std::vector> rnn_link_data, rnn_week_datas, + rnn_minute_datas; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + data.week_data_all.assign(week_data_all.begin() + batch_iter, + week_data_all.begin() + batch_end); + data.minute_data_all.assign(minute_data_all.begin() + batch_iter, + minute_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + data.lod3.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + CHECK(!data.week_data_all.empty()); + CHECK(!data.minute_data_all.empty()); + CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); + CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + } + data.rnn_week_datas.push_back(data.week_data_all[j]); + data.rnn_minute_datas.push_back(data.minute_data_all[j]); + // calculate lod + data.lod1.push_back(data.lod1.back() + + data.link_step_data_all[j].size()); + data.lod3.push_back(data.lod3.back() + 1); + for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { + data.lod2.push_back(data.lod2.back() + + data.link_step_data_all[j].size()); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + std::vector> link_step_data; + std::vector link_datas; + split(data[0], '|', &link_datas); + for (auto &step_data : link_datas) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + link_step_data.push_back(tmp); + } + // load week data + std::vector week_data; + split_to_float(data[2], ',', &week_data); + // load minute data + std::vector minute_data; + split_to_float(data[1], ',', &minute_data); + link_step_data_all.push_back(std::move(link_step_data)); + week_data_all.push_back(std::move(week_data)); + minute_data_all.push_back(std::move(minute_data)); + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_attention_tensor, 
init_zero_tensor, lod_tensor_tensor, + week_tensor, minute_tensor; + lod_attention_tensor.name = "data_lod_attention"; + init_zero_tensor.name = "cell_init"; + lod_tensor_tensor.name = "data"; + week_tensor.name = "week"; + minute_tensor.name = "minute"; + auto one_batch = data->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor.shape.assign({1, 2}); + lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); + init_zero_tensor.shape.assign({batch_size, 15}); + init_zero_tensor.lod.assign({one_batch.lod3}); + lod_tensor_tensor.shape = rnn_link_data_shape; + lod_tensor_tensor.lod.assign({one_batch.lod1}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor.lod.assign({one_batch.lod3}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on + // assign data + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); + std::vector tmp_zeros(batch_size * 15, 0.); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + // Set inputs. + auto init_zero_tensor1 = init_zero_tensor; + init_zero_tensor1.name = "hidden_init"; + input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, + init_zero_tensor1, lod_attention_tensor, + lod_tensor_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::FLOAT32; + } +} + +// Test with a really complicate model. +void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = activate_ir; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + config.ir_passes.clear(); // Do not exclude any pass. + + int batch_size = FLAGS_batch_size; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + // Prepare inputs. 
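// PrepareInputs (defined above) assembles the six FLOAT32 feeds this model
// expects: "week", "cell_init", "minute", "hidden_init" (a zero-filled
// batch_size x 15 copy of cell_init), the "data_lod_attention" helper tensor,
// and the main "data" LoD tensor built from rnn_link_data.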
+ PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + base_predictor->Run(input_slots, &base_outputs); + + std::vector> input_slots_all; + input_slots_all.emplace_back(input_slots); + if (num_threads == 1) { + TestOneThreadPrediction(config, input_slots_all, &outputs); + CompareResult(outputs, base_outputs); + } else { + // only return the output of first thread + TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads); + } + + if (use_analysis && activate_ir) { + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + } +} + +// Inference with analysis and IR, easy for profiling independently. +TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } + +// Other unit-tests of RNN1, test different options of use_analysis, +// activate_ir and multi-threads. +TEST(Analyzer, RNN_tests) { + int num_threads[2] = {1, 4}; + for (auto i : num_threads) { + // Directly infer with the original model. + TestRNN1Prediction(false, false, i); + // Inference with the original model with the analysis turned on, the + // analysis module will transform the program to a data flow graph. + TestRNN1Prediction(true, false, i); + // Inference with analysis and IR. The IR module will fuse some large + // kernels. + TestRNN1Prediction(true, true, i); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..c40ea58eea9c10a85acf84108f1d081a779f526d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
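Both RNN testers measure latency with the tic/toc Timer from the inference api helper header and report the per-repetition average through PrintTime. The sketch below reproduces that measurement pattern with std::chrono; the Timer defined here is a stand-in written for illustration, not the actual helper class.

// Sketch of the tic()/toc() timing pattern used by these testers.
#include <chrono>
#include <iostream>

struct Timer {  // illustrative stand-in, assumed to report milliseconds
  std::chrono::high_resolution_clock::time_point start;
  void tic() { start = std::chrono::high_resolution_clock::now(); }
  double toc() {
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(end - start).count();
  }
};

int main() {
  const int repeat = 100;
  Timer timer;
  timer.tic();
  for (int i = 0; i < repeat; ++i) {
    // predictor->Run(input_slots, &outputs) would go here.
  }
  // Average latency per repetition, the value handed to PrintTime.
  std::cout << "avg latency (ms): " << timer.toc() / repeat << "\n";
  return 0;
}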
+ +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector lod; + std::vector> rnn_link_data; + std::vector result_data; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + // calculate lod + data.lod.push_back(data.lod.back() + 11); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + if (num_lines % 2) { // feature + std::vector feature_data; + split(data[1], ' ', &feature_data); + std::vector> link_step_data; + int feature_count = 1; + std::vector feature; + for (auto &step_data : feature_data) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + feature.insert(feature.end(), tmp.begin(), tmp.end()); + if (feature_count % 11 == 0) { // each sample has 11 features + link_step_data.push_back(feature); + feature.clear(); + } + feature_count++; + } + link_step_data_all.push_back(std::move(link_step_data)); + } else { // result + std::vector tmp; + split_to_float(data[1], ',', &tmp); + result_data.insert(result_data.end(), tmp.begin(), tmp.end()); + } + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor feed_tensor; + feed_tensor.name = "feed"; + auto one_batch = data->NextBatch(); + int token_size = one_batch.rnn_link_data.size(); + // each token has 11 features, each feature's dim is 54. + std::vector rnn_link_data_shape({token_size * 11, 54}); + feed_tensor.shape = rnn_link_data_shape; + feed_tensor.lod.assign({one_batch.lod}); + feed_tensor.dtype = PaddleDType::FLOAT32; + TensorAssignData(&feed_tensor, one_batch.rnn_link_data); + // Set inputs. 
+ input_slots->assign({feed_tensor}); +} + +void CompareResult(const std::vector &outputs, + const std::vector &base_result) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_result[i], 1e-3); + } + } +} +// Test with a really complicate model. +void TestRNN2Prediction() { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = true; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + Timer timer1; + timer1.tic(); + for (int i = 0; i < num_times; i++) { + base_predictor->Run(input_slots, &base_outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times); + + Timer timer2; + timer2.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times); + + CompareResult(base_outputs, data.result_data); + CompareResult(outputs, data.result_data); +} + +TEST(Analyzer, rnn2) { TestRNN2Prediction(); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1472c475e4a3061ffcad96925ea215a41a7e63eb --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
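Every DataRecord/DataReader in these testers parses its input file with the split, split_to_int64 and split_to_float helpers from the inference api helper header. The sketch below shows the behaviour the testers assume; the implementations and the sample record are illustrative, not the actual helper code.

// Sketch of the string-splitting helpers the data loaders rely on.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

void split(const std::string &str, char sep, std::vector<std::string> *pieces) {
  pieces->clear();
  std::stringstream ss(str);
  std::string piece;
  while (std::getline(ss, piece, sep)) pieces->push_back(piece);
}

void split_to_int64(const std::string &str, char sep,
                    std::vector<int64_t> *ids) {
  std::vector<std::string> pieces;
  split(str, sep, &pieces);
  ids->clear();
  for (const auto &p : pieces) ids->push_back(std::stoll(p));
}

int main() {
  // Hypothetical record: fields separated by ';', token ids by ' '.
  std::vector<std::string> fields;
  split("label;24 25 25 25", ';', &fields);
  std::vector<int64_t> ids;
  split_to_int64(fields[1], ' ', &ids);
  std::cout << "tokens: " << ids.size() << "\n";  // prints "tokens: 4"
  return 0;
}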
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataReader { + explicit DataReader(const std::string &path) + : file(new std::ifstream(path)) {} + + bool NextBatch(std::vector *input, int batch_size) { + PADDLE_ENFORCE_EQ(batch_size, 1); + std::string line; + PaddleTensor tensor; + tensor.dtype = PaddleDType::INT64; + tensor.lod.emplace_back(std::vector({0})); + std::vector data; + + for (int i = 0; i < batch_size; i++) { + if (!std::getline(*file, line)) return false; + inference::split_to_int64(line, ' ', &data); + } + tensor.lod.front().push_back(data.size()); + + tensor.data.Resize(data.size() * sizeof(int64_t)); + memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); + tensor.shape.push_back(data.size()); + tensor.shape.push_back(1); + input->assign({tensor}); + return true; + } + + std::unique_ptr file; +}; + +void Main(int batch_size) { + // shape -- + // Create Predictor -- + AnalysisConfig config; + config.model_dir = FLAGS_infer_model; + config.use_gpu = false; + config.enable_ir_optim = true; + + std::vector input_slots, output_slots; + DataReader reader(FLAGS_infer_data); + std::vector> input_slots_all; + + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + int num_batches = 0; + while (reader.NextBatch(&input_slots, FLAGS_batch_size)) { + input_slots_all.emplace_back(input_slots); + ++num_batches; + } + LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size; + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + return; + } + + // one batch starts + // data -- + reader.NextBatch(&input_slots, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + + // Get output + LOG(INFO) << "get outputs " << output_slots.size(); + + for (auto &output : output_slots) { + LOG(INFO) << "output.shape: " << to_string(output.shape); + // no lod ? + CHECK_EQ(output.lod.size(), 0UL); + LOG(INFO) << "output.dtype: " << output.dtype; + std::stringstream ss; + for (int i = 0; i < 5; i++) { + ss << static_cast(output.data.data())[i] << " "; + } + LOG(INFO) << "output.data summary: " << ss.str(); + // one batch ends + } +} + +TEST(text_classification, basic) { Main(FLAGS_batch_size); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a207c41b7140c806b4c1fdc7f24a317b165c9aef --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +/* + * Use the native and analysis fluid engine to inference the demo. + * ocr, mobilenet and se_resnext50 + */ +void TestVisualPrediction(bool use_mkldnn) { + std::unique_ptr predictor; + AnalysisConfig cfg; + cfg.param_file = FLAGS_infer_model + "/__params__"; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.use_gpu = false; + cfg._use_mkldnn = use_mkldnn; + cfg.device = 0; + cfg.enable_ir_optim = true; + // TODO(TJ): fix fusion gru + cfg.ir_passes.push_back("fc_gru_fuse_pass"); +#ifdef PADDLE_WITH_MKLDNN + // disable mkldnn fuse since it should have some bugs + cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); +#endif + predictor = + CreatePaddlePredictor(cfg); + + // Only have single batch of data. + std::string line; + std::ifstream file(FLAGS_infer_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + std::vector outputs_slots; + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run({input}, &outputs_slots); + } + PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0, + timer.toc() / FLAGS_repeat); + + VLOG(3) << "output.size " << outputs_slots.size(); + + // run native as reference + auto ref_predictor = + CreatePaddlePredictor(cfg); + std::vector ref_outputs_slots; + ref_predictor->Run({input}, &ref_outputs_slots); + CompareResult(outputs_slots, ref_outputs_slots); + // print what are fused + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; +} + +TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, analysis_mkldnn) { + TestVisualPrediction(/*use_mkldnn*/ true); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..43e97614e3ad9c14c8deee9f340757f373eb593e --- /dev/null +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -0,0 +1,141 @@ +// Copyright 
(c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data file"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(burning, 0, "Burning before repeat."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_GT(size, 0); + EXPECT_EQ(size, ref_size); + EXPECT_EQ(out.dtype, ref_out.dtype); + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3); + } + break; + } + } + } +} + +void TestOneThreadPrediction( + AnalysisConfig config, const std::vector> inputs, + std::vector *outputs) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + auto predictor = + CreatePaddlePredictor( + config); + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs.size(); j++) { + predictor->Run(inputs[j], outputs); + } + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times, + inputs.size()); +} + +void TestMultiThreadPrediction( + AnalysisConfig config, const std::vector> inputs, + std::vector *outputs, int num_threads) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. 
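// Because of the limitation above, every predictor is created serially in the
// loop that follows, before any worker thread starts; only the Run() calls
// inside the threads execute concurrently, each on its own copy of the inputs
// and its own output vector.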
+ for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local inputs and outputs. + // The inputs of each thread are all the same. + std::vector> inputs_tid = inputs; + std::vector outputs_tid; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs_tid.size(); j++) { + predictors[tid]->Run(inputs_tid[j], &outputs_tid); + } + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times, inputs_tid.size()); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +void TestPrediction(AnalysisConfig config, + const std::vector> inputs, + std::vector *outputs, int num_threads) { + if (num_threads == 1) { + TestOneThreadPrediction(config, inputs, outputs); + } else { + TestMultiThreadPrediction(config, inputs, outputs, num_threads); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 2fa5a9540ba1311c7f87e6675a53044b23dd8276..017fc4cd7b11c150cb941fffca2606a4d707330f 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -17,7 +17,7 @@ function(inference_test TARGET_NAME) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc - DEPS paddle_fluid + DEPS paddle_fluid_origin ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) @@ -43,6 +43,6 @@ inference_test(word2vec) # TODO(TJ): clean me up cc_test(test_inference_nlp SRCS test_inference_nlp.cc - DEPS paddle_fluid + DEPS paddle_fluid_origin ARGS --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5cc1db12bb71e428d493e7c6f718b1c6ed431858..cbcfc964c91c33ab41a72ad7fec759086ad887cc 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -20,9 +20,8 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/cpu_helper.h" -#ifdef PADDLE_WITH_MKLML -#include -#endif + +#include "paddle/fluid/framework/feed_fetch_method.h" DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); @@ -30,6 +29,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); DECLARE_bool(use_mkldnn); +DECLARE_int32(paddle_num_threads); inline double GetCurrentMs() { struct timeval time; @@ -126,14 +126,35 @@ void ThreadRunInfer( std::map feed_targets; PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + // map the data of feed_targets to feed_holder + for (auto* op : inference_program->Block(0).AllOps()) { + if (op->Type() == "feed") { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + paddle::framework::SetFeedVariable(scope, *feed_targets[feed_target_name], + "feed", idx); + } + } + auto& inputs = jobs[tid]; auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; - executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets, - &fetch_targets, false /*create_local_scope*/); + executor.RunPreparedContext(ctx.get(), &sub_scope, + false /*create_local_scope*/); } auto stop_ms = GetCurrentMs(); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : inference_program->Block(0).AllOps()) { + if (op->Type() == "fetch") { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + paddle::framework::GetFetchVariable(*scope, "fetch", idx); + } + } + scope->DeleteScope(&sub_scope); LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() << " samples, avg time per sample: " @@ -160,12 +181,7 @@ TEST(inference, nlp) { std::unique_ptr scope( new paddle::framework::Scope()); -#ifdef PADDLE_WITH_MKLML - // only use 1 thread number per std::thread - omp_set_dynamic(0); - omp_set_num_threads(1); - paddle::platform::SetNumThreads(1); -#endif + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 44c36b1683b037832a218df02184e7cd2ba143e9..94f0550df57e79fa68c135f5c9c4b7effe6ac156 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" @@ -135,6 +136,15 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } +void Compile(paddle::framework::ProgramDesc* program) { + std::unique_ptr g( + new paddle::framework::ir::Graph(*program)); + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + pass->SetNotOwned("program", program); + pass->Apply(std::move(g)); +} + template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, @@ -172,6 +182,8 @@ void TestInference(const std::string& dirname, paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); } + Compile(inference_program.get()); + // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, "load_program_profiler"); @@ -210,13 +222,14 @@ void TestInference(const std::string& dirname, // Ignore the profiling results of the first run std::unique_ptr ctx; + bool CreateLocalScope = CreateVars; if (PrepareContext) { ctx = executor.Prepare(*inference_program, 0); executor.RunPreparedContext(ctx.get(), scope, &feed_targets, - &fetch_targets, true, CreateVars); + &fetch_targets, CreateLocalScope, CreateVars); } else { executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, - true, CreateVars); + CreateLocalScope, CreateVars); } // Enable the profiler @@ -232,10 +245,11 @@ void TestInference(const std::string& dirname, // Note: if you change the inference_program, you need to call // executor.Prepare() again to get a new ExecutorPrepareContext. executor.RunPreparedContext(ctx.get(), scope, &feed_targets, - &fetch_targets, CreateVars); + &fetch_targets, CreateLocalScope, + CreateVars); } else { executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, - CreateVars); + CreateLocalScope, CreateVars); } } @@ -247,3 +261,5 @@ void TestInference(const std::string& dirname, delete scope; } + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 4194ba197948b47003863196efdac1c08a7ae4f6..c2f45fdc99b87bc12c2aadf1985de6e98a24fce7 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -15,12 +15,17 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "glog/logging.h" +DEFINE_bool(free_idle_memory, false, + "If it is true, Paddle will try to free idle memory trunks during " + "running time."); + namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, - size_t min_chunk_size, size_t max_chunk_size) +BuddyAllocator::BuddyAllocator( + std::unique_ptr system_allocator, size_t min_chunk_size, + size_t max_chunk_size) : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), cache_(system_allocator->UseGpu()), @@ -151,13 +156,14 @@ void BuddyAllocator::Free(void* p) { pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); - // Clean up if existing too much free memory - - // Prefer freeing fallback allocation first - CleanIdleFallBackAlloc(); + if (FLAGS_free_idle_memory) { + // Clean up if existing too much free memory + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); - // Free normal allocation - CleanIdleNormalAlloc(); + // Free normal allocation + CleanIdleNormalAlloc(); + } } size_t BuddyAllocator::Used() { return total_used_; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 2f39d774d6fb6a2bc37877eb2f8b90bebd3cda28..f0c83efc23ce39c4fc89296d672e1e55751851bf 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include // NOLINT #include #include @@ -32,8 +33,8 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, - size_t max_chunk_size); + BuddyAllocator(std::unique_ptr system_allocator, + size_t min_chunk_size, size_t max_chunk_size); ~BuddyAllocator(); @@ -103,7 +104,7 @@ class BuddyAllocator { private: /*! Allocate CPU/GPU memory from system */ - SystemAllocator* system_allocator_; + std::unique_ptr system_allocator_; std::mutex mutex_; }; diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 9b1ab1e228dd758b52975abc4c4aa0bdeadbe2de..1b96798d23cec34a1863f56c1e4027ce32b2eec5 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -11,12 +11,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#define GLOG_NO_ABBREVIATED_SEVERITIES #include "paddle/fluid/memory/detail/system_allocator.h" -#include // for malloc and free +#ifdef _WIN32 +#include +#include // VirtualLock/VirtualUnlock +#else #include // for mlock and munlock -#include // for std::max +#endif +#include // for malloc and free +#include // for std::max #include "gflags/gflags.h" #include "paddle/fluid/platform/assert.h" @@ -35,31 +41,42 @@ namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t* index, size_t size) { - // According to http://www.cplusplus.com/reference/cstdlib/malloc/, - // malloc might not return nullptr if size is zero, but the returned - // pointer shall not be dereferenced -- so we make it nullptr. 
- if (size <= 0) return nullptr; - - *index = 0; // unlock memory - +void* AlignedMalloc(size_t size) { void* p = nullptr; - + size_t alignment = 32ul; #ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment - PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!", - size); + alignment = 4096ul; +#endif +#ifdef _WIN32 + p = _aligned_malloc(size, alignment); #else - PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!", + PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!", size); #endif PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); + return p; +} + +void* CPUAllocator::Alloc(size_t* index, size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. + if (size <= 0) return nullptr; + + *index = 0; // unlock memory + + void* p = AlignedMalloc(size); if (p != nullptr) { if (FLAGS_use_pinned_memory) { *index = 1; +#ifdef _WIN32 + VirtualLock(p, size); +#else mlock(p, size); // lock memory +#endif } } @@ -68,7 +85,11 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { void CPUAllocator::Free(void* p, size_t size, size_t index) { if (p != nullptr && index == 1) { +#ifdef _WIN32 + VirtualUnlock(p, size); +#else munlock(p, size); +#endif } free(p); } diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index bd98ed81899440a46415d30b6d74fec2dac4c155..7c800b3c164049244770ceb2070b177d8307e85e 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "paddle/fluid/memory/malloc.h" #include "glog/logging.h" @@ -34,12 +36,15 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new detail::BuddyAllocator(new detail::CPUAllocator, - platform::CpuMinChunkSize(), - platform::CpuMaxChunkSize()); - } + + std::call_once(init_flag, []() { + a = new detail::BuddyAllocator( + std::unique_ptr(new detail::CPUAllocator), + platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + }); + return a; } @@ -68,27 +73,33 @@ size_t Used(platform::CPUPlace place) { #ifdef PADDLE_WITH_CUDA BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { + static std::once_flag init_flag; + static detail::BuddyAllocator** a_arr = nullptr; + + std::call_once(init_flag, [gpu_id]() { int gpu_num = platform::GetCUDADeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = nullptr; + PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, + gpu_num); + + a_arr = new BuddyAllocator*[gpu_num]; + for (int i = 0; i < gpu_num; i++) { + a_arr[i] = nullptr; + platform::SetDeviceId(i); + a_arr[i] = new BuddyAllocator( + std::unique_ptr(new detail::GPUAllocator(i)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; } - } + }); + platform::SetDeviceId(gpu_id); - if (!as[gpu_id]) { - as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id), - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - VLOG(10) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; - } - return as[gpu_id]; + return a_arr[gpu_id]; } template <> @@ -125,12 +136,16 @@ void Free(platform::CUDAPlace place, void* p) { } BuddyAllocator* GetCUDAPinnedBuddyAllocator() { - static BuddyAllocator* ba = NULL; - if (ba == NULL) { - ba = new BuddyAllocator(new detail::CUDAPinnedAllocator, + static std::once_flag init_flag; + static BuddyAllocator* ba = nullptr; + + std::call_once(init_flag, []() { + ba = new BuddyAllocator(std::unique_ptr( + new detail::CUDAPinnedAllocator), platform::CUDAPinnedMinChunkSize(), platform::CUDAPinnedMaxChunkSize()); - } + }); + return ba; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ab1d2143330fb8cbfd535758a83bc71de939c4e0..7ec1e78da4ec642cb1e6248edfbcfed748fa11b8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -9,7 +9,6 @@ function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library # for ops. 
- set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) @@ -84,6 +83,16 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() + #remove windows unsupported op + if (WIN32) + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + if ("${TARGET}" STREQUAL "${windows_unsupport_op}") + return() + endif() + endforeach() + endif(WIN32) + set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) + list(LENGTH op_library_DEPS op_library_DEPS_len) if (${op_library_DEPS_len} GREATER 0) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) @@ -100,7 +109,8 @@ function(op_library TARGET) endif() # Define operators that don't need pybind here. - foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" +"tensor_array_read_write_op" "tensorrt_engine_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -168,6 +178,13 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") + elseif(${TARGET} STREQUAL "fake_quantize") + file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") + elseif(${TARGET} STREQUAL "tensorrt_engine_op") + message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") + elseif(${TARGET} STREQUAL "fc") + # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition + file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() @@ -175,24 +192,24 @@ function(op_library TARGET) endfunction() add_subdirectory(math) +if (NOT WIN32) add_subdirectory(nccl) - if(WITH_GPU) op_library(nccl_op DEPS nccl_common) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +endif() # NOT WIN32 set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) add_subdirectory(distributed) - set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) @@ -216,7 +233,7 @@ if(WITH_DISTRIBUTE) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) - if(WITH_GPU) + if(WITH_GPU AND NOT WIN32) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) @@ -227,19 +244,25 @@ if(WITH_DISTRIBUTE) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) - endif() + endif() # WITH_GPU AND NOT WIN32 else() set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op 
send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) -op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +if(WITH_GPU) + op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub) +else() + op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +endif() + op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) - op_library(tensorrt_engine_op DEPS tensorrt_engine) + op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + DEPS tensorrt_engine_op analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) @@ -259,15 +282,24 @@ op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) +op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) +op_library(unsqueeze_op DEPS reshape_op) +op_library(squeeze_op DEPS reshape_op) +op_library(extract_rows_op DEPS memory) +op_library(flatten_op DEPS reshape_op) +op_library(sequence_pad_op DEPS sequence_padding) +op_library(unstack_op DEPS stack_op) +op_library(fake_quantize_op DEPS memory) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) + op_library(layer_norm_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() @@ -287,19 +319,14 @@ op_library(channel_recv_op DEPS concurrency) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) -# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF -# Because the fully connected layer has only one MKLDNN's operator -if(NOT WITH_MKLDNN) - list(REMOVE_ITEM GENERAL_OPS fc_op) -endif(NOT WITH_MKLDNN) - foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") - +if (NOT WIN32) add_subdirectory(reader) +endif(NOT WIN32) foreach(src ${READER_LIBRARY}) set(OP_LIBRARY ${src} ${OP_LIBRARY}) endforeach() @@ -319,5 +346,7 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 912415192659dc004f54a76e9cd1a20581d512a6..2e31d1c9c708225135e27c93ba94722794c4b282 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = static_cast(1) 
/ (static_cast(1) + (static_cast(-beta) * x).exp()); - auto temp2 = temp1 * (static_cast(1) - (beta * out)); - dx.device(d) = dout * ((beta * out) + temp2); + auto temp2 = temp1 * (static_cast(1) - (static_cast(beta) * out)); + dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } }; diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index a7a28b02b67f2ef180ec0e273dbe7ef555f88ce2..84a584f424823a450effd4c36e9da600f5851da2 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel { auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); int64_t* rows = nullptr; +// When compiled without CUDA, the CUDAMutableData() interface should not be +// provided. +#if defined(PADDLE_WITH_CUDA) if (platform::is_gpu_place(ctx.GetPlace())) { rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); } else { +#endif rows = grad_merge.mutable_rows()->data(); + +#if defined(PADDLE_WITH_CUDA) } +#endif auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b943440a869e213db4ed761cfe7c508bc5e94ae --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -0,0 +1,419 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
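The SwishGradFunctor hunk above only adds the missing static_cast on beta; the gradient formula itself is unchanged: for out = x * sigmoid(beta * x), dx = dout * (beta * out + sigmoid(beta * x) * (1 - beta * out)). The following standalone C++ sketch (not the Eigen functor, with arbitrary sample values) checks that formula against a finite difference:

#include <cmath>
#include <cstdio>

// Analytic Swish gradient as written in SwishGradFunctor, checked numerically.
static double sigmoid(double z) { return 1.0 / (1.0 + std::exp(-z)); }

int main() {
  const double beta = 1.5, x = 0.7, dout = 1.0, eps = 1e-6;
  const double out = x * sigmoid(beta * x);
  const double temp1 = sigmoid(beta * x);            // matches temp1 in the functor
  const double temp2 = temp1 * (1.0 - beta * out);   // matches temp2 in the functor
  const double dx = dout * (beta * out + temp2);     // analytic gradient
  const double out_hi = (x + eps) * sigmoid(beta * (x + eps));
  const double out_lo = (x - eps) * sigmoid(beta * (x - eps));
  const double dx_ref = dout * (out_hi - out_lo) / (2.0 * eps);
  std::printf("analytic=%.6f finite-difference=%.6f\n", dx, dx_ref);
  return 0;
}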
*/ + +#include "paddle/fluid/operators/attention_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Assert only one Input(X) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Assert only one Input(C0) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), + "Assert only one Input(LSTMWeight) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), + "Assert only one Input(LSTMBias) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), + "Assert only one Input(AttentionWeight) of AttentionLSTM."); + + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), + "Assert only one Output(AttentionedX) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), + "Assert only one Output(AttentionFCOut) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), + "Assert only one Output(LSTMX) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), + "Assert only one Output(LSTMOUT) of AttentionLSTM."); + + auto x_dims = ctx->GetInputDim("X"); + const int M = x_dims[1]; + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + auto w_dims = ctx->GetInputDim("LSTMWeight"); + const int D = w_dims[1] / 4; + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], D + M, + "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D); + + auto b_dims = ctx->GetInputDim("LSTMBias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D); + + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); + PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); + if (ctx->HasInput("H0")) { + auto h_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto atten_w_dims = ctx->GetInputDim("AttentionWeight"); + PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2, + "Input(AttentionWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + if (ctx->HasInput("AttentionBias")) { + auto atten_b_dims = ctx->GetInputDim("AttentionBias"); + PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, + "Input(AttentionBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_b_dims[0], 1, + "AttentionBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(atten_b_dims[1], 1, + "AttentionBias shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalar")) { + auto dims = ctx->GetInputDim("AttentionScalar"); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalar)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 
* 1."); + } + + if (ctx->HasInput("AttentionScalarBias")) { + auto dims = ctx->GetInputDim("AttentionScalarBias"); + PADDLE_ENFORCE( + ctx->HasInput("AttentionScalar"), + "AttentionScalar should not be null when have AttentionScalarBias."); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalarBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1."); + } + + framework::DDim out_dims({x_dims[0], D}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("AttentionedX", {x_dims[0], 1}); + ctx->SetOutputDim("LSTMX", {1, M}); + ctx->SetOutputDim("LSTMOUT", {1, 4 * D}); + // AttentionFCOut should be reshape as (maxseqlen,1) in runtime + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); +} + +framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void AttentionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("C0", + "(Tensor) LSTM C0" + "This is a tensor with shape (N x D), where N is the batch size, D " + "is the gate size." + "C0 is necessary because of attention."); + AddInput("H0", + "(Tensor, optional) LSTM H0" + "This is a tensor with shape (N x D), where N is the " + "batch size and D is the gate size.") + .AsDispensable(); + AddInput("AttentionWeight", + "(Tensor) the weights of attention fc. Always relu the fc result." + "The shape is ((M+D) x 1), where M is the dim size of x, D is the " + "gate size of LSTM."); + AddInput("AttentionBias", + "(Tensor, optional) the bias of attention fc." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalar", + "(Tensor, optional) the scalar on the result of attentioned fc. " + "Always relu the Scalar." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalarBias", + "(Tensor, optional) the scalar bias of attention fc." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("LSTMWeight", + "(Tensor) the combined weight of LSTM" + " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " + "is the dim size of x" + " - Weight = {W_forget, W_input, W_output, W_cell}"); + AddInput("LSTMBias", + "(Tensor) the combined bias of LSTM, shape (1x4D)." + "Note: we should add the bias of hidden and context accorindg to " + "the same gate: " + "{B_forget, B_input, B_output, B_cell}"); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("AttentionedX", + "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") + .AsIntermediate(); + AddOutput("AttentionFCOut", + "(Tensor) (max_seq_len, 1), compute at each step.") + .AsIntermediate(); + AddOutput("LSTMX", + "(Tensor) the input X of LSTM for each step." 
+ "Shape is (1 x M), where M is the x frame size") + .AsIntermediate(); + AddOutput( + "LSTMOUT", + "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") + .AsIntermediate(); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Attention Long-Short Term Memory (LSTM) Operator. + +Attention part: +concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D)) + +tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu + +fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu + +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) + +LSTM part: +use lstm_x_t as input and compute as standard LSTM. + +)DOC"); +} + +// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; +template +inline void bias_relu(const int n, const T* x, const T* bias, T* y) { + if (bias) { + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); + } else { + math::vec_relu(n, x, y); + } +} + +template +inline void vec_softmax(const int n, const T* x, T* y) { + T scalar = x[0]; + // max + for (int i = 1; i < n; ++i) { + scalar = scalar < x[i] ? x[i] : scalar; + } + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp + // sum + scalar = T(0); + for (int i = 0; i < n; ++i) { + scalar += y[i]; + } + math::vec_scal(n, static_cast(1) / scalar, y); // scale +} + +template +class AttentionLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* atten_w = ctx.Input("AttentionWeight"); + auto* atten_b = ctx.Input("AttentionBias"); + auto* atten_scalar = ctx.Input("AttentionScalar"); + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); + auto* lstm_w = ctx.Input("LSTMWeight"); + auto* lstm_b = ctx.Input("LSTMBias"); + + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + auto* atted_x = ctx.Output("AttentionedX"); + auto* fc_out = ctx.Output("AttentionFCOut"); + auto* lstm_x = ctx.Output("LSTMX"); + auto* lstm_out = ctx.Output("LSTMOUT"); + + // some shape should be reshape here since infershape can not get lod info + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; // batch size + auto x_dims = x->dims(); // T x M + auto w_dims = lstm_w->dims(); // (D+M) x 4D + const int total_T = x_dims[0]; + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + int max_seq_len = x_lod[0][1]; + for (int i = 1; i < N; ++i) { + int len = x_lod[0][i + 1] - x_lod[0][i]; + max_seq_len = max_seq_len < len ? 
len : max_seq_len; + } + PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1."); + PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); + fc_out->Resize({max_seq_len, 1}); + + std::function act_gate, act_cell, act_cand; + auto& act_gate_str = ctx.Attr("gate_activation"); + auto& act_cell_str = ctx.Attr("cell_activation"); + auto& act_cand_str = ctx.Attr("candidate_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } else { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } + + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : NULL; + const T* c0_data = c0->data(); + const T* lstm_w_data = lstm_w->data(); + const T* lstm_b_data = lstm_b->data(); + const T* atten_w_data = atten_w->data(); + const T* atten_b_data = atten_b ? atten_b->data() : NULL; + const T* atten_scalar_data = atten_scalar ? atten_scalar->data() : NULL; + const T* atten_scalar_bias_data = + atten_scalar_bias ? atten_scalar_bias->data() : NULL; + + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + T* atted_x_data = atted_x->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); + T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); + + // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, + atted_x_data, atten_b_data); + + const T* cur_atten_x_data = atted_x_data; + const T* cur_x_data = x_data; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + T* cur_cell_out_data = cell_out_data; + T* cur_hidden_out_data = hidden_out_data; + for (int i = 0; i < N; ++i) { + int seq_len = x_lod[0][i + 1] - x_lod[0][i]; + prev_cell_data = c0_data + i * D; + prev_hidden_data = h0_data ? h0_data + i * D : NULL; + for (int step = 0; step < seq_len; ++step) { + /// 1. compute attention vector + // 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt + T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); + // 1b. add cell bias and relu + bias_relu(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data); + // 1c. fc scalar + if (atten_scalar_data) { + blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); + bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, + fc_out_data); + } + // 1d. softmax + vec_softmax(seq_len, fc_out_data, fc_out_data); + // mul x(seq_len*M) and sum pool + math::FCCompute(blas, 1, M, seq_len, fc_out_data, + cur_x_data, lstm_x_data); + + /// 2. 
compute LSTM step + // lstm weight : concat[forget , input , output , tilde] + // shape : (D + M) x (4 * D) + // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D + blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data); + if (prev_hidden_data) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, lstm_w_data, D4, static_cast(1), + lstm_out_data, D4); + } + // since input is 1xM, so can use add bias + blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); + + // gate act: sigmoid + act_gate(D3, lstm_out_data, lstm_out_data); + // candidate act: tanh + act_cand(D, lstm_out_data + D3, lstm_out_data + D3); + + // a = forget * prev_cell + blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); + + // b = input * tilde + blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D); + + // cell_out = a + b + blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); + + // state act tanh(cell_out) * output_gate + act_cell(D, cur_cell_out_data, lstm_out_data); + blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); + + prev_hidden_data = cur_hidden_out_data; + prev_cell_data = cur_cell_out_data; + cur_cell_out_data = cur_cell_out_data + D; + cur_hidden_out_data = cur_hidden_out_data + D; + } + cur_x_data = cur_x_data + seq_len * M; + cur_atten_x_data = cur_atten_x_data + seq_len; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, + ops::AttentionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..6ede3a7f3c96dd2d13d7c5c19816647e16a3c8d0 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */
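The time-step loop above expresses the recurrence with BLAS calls on a packed (1 x 4D) buffer. Stripped of the BLAS packing, and assuming the default sigmoid/tanh activations and the gate layout stated in the code comments ([forget, input, output, candidate]), one step reduces to the scalar sketch below; it is only an illustration, not the kernel itself:

#include <cmath>
#include <cstdio>
#include <vector>

// One LSTM step on pre-activations laid out as [forget | input | output | candidate].
void lstm_step(const std::vector<double>& preact,     // size 4 * D, already W*x + U*h + b
               const std::vector<double>& prev_cell,  // size D
               std::vector<double>* cell,             // size D
               std::vector<double>* hidden) {         // size D
  const size_t D = prev_cell.size();
  auto sigm = [](double v) { return 1.0 / (1.0 + std::exp(-v)); };
  for (size_t d = 0; d < D; ++d) {
    const double f = sigm(preact[d]);                   // forget gate
    const double i = sigm(preact[D + d]);               // input gate
    const double o = sigm(preact[2 * D + d]);           // output gate
    const double c_hat = std::tanh(preact[3 * D + d]);  // candidate ("tilde")
    (*cell)[d] = f * prev_cell[d] + i * c_hat;
    (*hidden)[d] = std::tanh((*cell)[d]) * o;
  }
}

int main() {
  const size_t D = 2;
  std::vector<double> preact(4 * D, 0.5), prev_cell(D, 1.0), cell(D), hidden(D);
  lstm_step(preact, prev_cell, &cell, &hidden);
  std::printf("cell[0]=%.4f hidden[0]=%.4f\n", cell[0], hidden[0]);
  return 0;
}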
+ +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class AttentionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index c9871a9fe6b3b0d0cf671c2d155715f92c94fd8f..dfaa7456f917c1308984b361afed752f96ea6f59 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/auc_op.h" -#include namespace paddle { namespace operators { @@ -24,26 +23,31 @@ class AucOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), - "Input of Indices should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Predict"), + "Input of Predict should not be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input of Label should not be null."); - auto inference_height = ctx->GetInputDim("Out")[0]; + auto predict_width = ctx->GetInputDim("Predict")[1]; + PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification"); + auto predict_height = ctx->GetInputDim("Predict")[0]; auto label_height = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(inference_height, label_height, + PADDLE_ENFORCE_EQ(predict_height, label_height, "Out and Label should have same height."); + int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; + ctx->SetOutputDim("AUC", {1}); - ctx->ShareLoD("Out", /*->*/ "AUC"); + ctx->SetOutputDim("BatchAUC", {1}); + ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); + ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Out")->type()), + framework::ToDataType(ctx.Input("Predict")->type()), ctx.device_context()); } }; @@ -51,29 +55,31 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Out", - "A floating point 2D tensor, values are in the range [0, 1]." - "Each row is sorted in descending order. This input should be the" - "output of topk." + AddInput("Predict", + "A floating point 2D tensor with shape [batch_size, 2], values " + "are in the range [0, 1]." "Typically, this tensor indicates the probability of each label"); - AddInput("Indices", - "An int 2D tensor, indicating the indices of original" - "tensor before sorting. Typically, this tensor indicates which " - "label the probability stands for."); AddInput("Label", - "A 2D int tensor indicating the label of the training data." - "The height is batch size and width is always 1."); + "A 2D int tensor indicating the label of the training data.
" + "shape: [batch_size, 1]"); // TODO(typhoonzero): support weight input + AddInput("StatPos", "Statistic value when label = 1"); + AddInput("StatNeg", "Statistic value when label = 0"); + AddOutput("AUC", "A scalar representing the " "current area-under-the-curve."); + AddOutput("BatchAUC", "The AUC for current batch"); + AddOutput("StatPosOut", "Statistic value when label = 1"); + AddOutput("StatNegOut", "Statistic value when label = 0"); AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") .SetDefault("ROC"); + AddAttr("num_thresholds", "The number of thresholds to use when discretizing the" " roc curve.") - .SetDefault(200); + .SetDefault((2 << 12) - 1); AddComment(R"DOC( Area Under The Curve (AUC) Operator. diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index 8b016c3d31ad83e66baeb298c61840cc529efa1e..fb0517d70635e090f8c5b59ff9d8420fc34c747b 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -23,110 +23,85 @@ namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - template class AucKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* label = ctx.Input("Label"); - auto* auc = ctx.Output("AUC"); - - float* auc_data = auc->mutable_data(ctx.GetPlace()); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *predict = ctx.Input("Predict"); + auto *label = ctx.Input("Label"); std::string curve = ctx.Attr("curve"); int num_thresholds = ctx.Attr("num_thresholds"); - std::vector thresholds_list; - thresholds_list.reserve(num_thresholds); - for (int i = 1; i < num_thresholds - 1; i++) { - thresholds_list[i] = static_cast(i) / (num_thresholds - 1); - } - const float kEpsilon = 1e-7; - thresholds_list[0] = 0.0f - kEpsilon; - thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; - - size_t batch_size = inference->dims()[0]; - size_t inference_width = inference->dims()[1]; - - const T* inference_data = inference->data(); - const int64_t* label_data = label->data(); - - // Create local tensor for storing the curve: TP, FN, TN, FP - // TODO(typhoonzero): use eigen op to caculate these values. - Tensor true_positive, false_positive, true_negative, false_negative; - - true_positive.Resize({num_thresholds}); - false_negative.Resize({num_thresholds}); - true_negative.Resize({num_thresholds}); - false_positive.Resize({num_thresholds}); - - int64_t* tp_data = true_positive.mutable_data(ctx.GetPlace()); - int64_t* fn_data = false_negative.mutable_data(ctx.GetPlace()); - int64_t* tn_data = true_negative.mutable_data(ctx.GetPlace()); - int64_t* fp_data = false_positive.mutable_data(ctx.GetPlace()); - - for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { - // caculate TP, FN, TN, FP for current thresh - int64_t tp = 0, fn = 0, tn = 0, fp = 0; - for (size_t i = 0; i < batch_size; i++) { - // NOTE: label_data used as bool, labels >0 will be treated as true. 
- if (label_data[i]) { - // use first(max) data in each row - if (inference_data[i * inference_width] >= - (thresholds_list[idx_thresh])) { - tp++; - } else { - fn++; - } - } else { - if (inference_data[i * inference_width] >= - (thresholds_list[idx_thresh])) { - fp++; - } else { - tn++; - } - } + int num_pred_buckets = num_thresholds + 1; + + // Only use output var for now, make sure it's persistable and + // not cleaned up for each batch. + auto *auc = ctx.Output("AUC"); + auto *stat_pos = ctx.Output("StatPosOut"); + auto *stat_neg = ctx.Output("StatNegOut"); + + auto *stat_pos_data = stat_pos->mutable_data(ctx.GetPlace()); + auto *stat_neg_data = stat_neg->mutable_data(ctx.GetPlace()); + calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds, + auc); + + auto *batch_auc = ctx.Output("BatchAUC"); + std::vector stat_pos_batch(num_pred_buckets, 0); + std::vector stat_neg_batch(num_pred_buckets, 0); + calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), + num_thresholds, batch_auc); + } + + private: + inline static double trapezoidArea(double X1, double X2, double Y1, + double Y2) { + return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; + } + + inline static void calcAuc(const framework::ExecutionContext &ctx, + const framework::Tensor *label, + const framework::Tensor *predict, + int64_t *stat_pos, int64_t *stat_neg, + int num_thresholds, + framework::Tensor *auc_tensor) { + size_t batch_size = predict->dims()[0]; + size_t inference_width = predict->dims()[1]; + const T *inference_data = predict->data(); + const auto *label_data = label->data(); + + auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); + + for (size_t i = 0; i < batch_size; i++) { + uint32_t binIdx = static_cast( + inference_data[i * inference_width + 1] * num_thresholds); + if (label_data[i]) { + stat_pos[binIdx] += 1.0; + } else { + stat_neg[binIdx] += 1.0; } - // store rates - tp_data[idx_thresh] = tp; - fn_data[idx_thresh] = fn; - tn_data[idx_thresh] = tn; - fp_data[idx_thresh] = fp; } - // epsilon to avoid divide by zero. - float epsilon = 1e-6; - // Riemann sum to caculate auc. 
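The new AucKernel drops the explicit threshold sweep shown in the removed lines: it buckets each sample by the predicted probability of the positive class (binIdx = p * num_thresholds), accumulates positive and negative counts per bucket in StatPos/StatNeg so the statistics persist across batches, then integrates the ROC curve with the trapezoid rule and normalizes by total positives times total negatives. A self-contained sketch of that computation on made-up data (bucket count and values are arbitrary):

#include <cstdio>
#include <vector>

int main() {
  const int num_thresholds = 4;  // num_thresholds + 1 buckets, as in the op
  std::vector<double> prob = {0.9, 0.8, 0.3, 0.6, 0.1};  // P(label == 1)
  std::vector<int> label = {1, 1, 0, 1, 0};
  std::vector<long> stat_pos(num_thresholds + 1, 0), stat_neg(num_thresholds + 1, 0);
  for (size_t i = 0; i < prob.size(); ++i) {
    const int bin = static_cast<int>(prob[i] * num_thresholds);
    if (label[i]) {
      stat_pos[bin] += 1;
    } else {
      stat_neg[bin] += 1;
    }
  }
  auto trapezoid = [](double x1, double x2, double y1, double y2) {
    return (x1 > x2 ? x1 - x2 : x2 - x1) * (y1 + y2) / 2.0;
  };
  double auc = 0.0, tot_pos = 0.0, tot_neg = 0.0;
  for (int idx = num_thresholds; idx >= 0; --idx) {  // sweep from high threshold to low
    const double pos_prev = tot_pos, neg_prev = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    auc += trapezoid(tot_neg, neg_prev, tot_pos, pos_prev);
  }
  if (tot_pos > 0.0 && tot_neg > 0.0) auc /= tot_pos * tot_neg;
  std::printf("AUC = %.4f\n", auc);  // 1.0 here, since the classes separate perfectly
  return 0;
}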
- Tensor tp_rate, fp_rate, rec_rate; - tp_rate.Resize({num_thresholds}); - fp_rate.Resize({num_thresholds}); - rec_rate.Resize({num_thresholds}); - float* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); - float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); - float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); - for (int i = 0; i < num_thresholds; i++) { - tp_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / - (tp_data[i] + fn_data[i] + epsilon); - fp_rate_data[i] = - static_cast(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); - rec_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / - (tp_data[i] + fp_data[i] + epsilon); + + *auc = 0.0f; + + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + + int idx = num_thresholds; + + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += stat_pos[idx]; + totNeg += stat_neg[idx]; + *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); + + --idx; } - *auc_data = 0.0f; - if (curve == "ROC") { - for (int i = 0; i < num_thresholds - 1; i++) { - auto dx = fp_rate_data[i] - fp_rate_data[i + 1]; - auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f; - *auc_data = *auc_data + dx * y; - } - } else if (curve == "PR") { - for (int i = 1; i < num_thresholds; i++) { - auto dx = tp_rate_data[i] - tp_rate_data[i - 1]; - auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f; - *auc_data = *auc_data + dx * y; - } + + if (totPos > 0.0 && totNeg > 0.0) { + *auc = *auc / totPos / totNeg; } } }; diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 9ab2179b5fe689762704039c5f67dd080e530aa5..de641cb08e4cc3322cc8387d873f2aaab279e1dd 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -37,6 +37,95 @@ struct bn_type_traits { using op_prim = typename op_type::primitive_desc; }; +class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { + public: + BatchNormMKLDNNHandler( + std::shared_ptr batch_norm_pd, + const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine, + const std::string &base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + batch_norm_pd_ = batch_norm_pd; + } + + std::shared_ptr AcquireScaleshiftMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->weights_primitive_desc(), ptr, "@scaleshift_mem_p"); + } + + std::shared_ptr AcquireMeanMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->mean_primitive_desc(), ptr, "@mean_mem_p"); + } + + std::shared_ptr AcquireVarianceMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); + } + + std::shared_ptr AcquireTestTrainingBatchNormFwd( + std::shared_ptr src_memory, + std::shared_ptr scaleshift_memory, + std::shared_ptr dst_memory, std::shared_ptr mean_memory, + std::shared_ptr variance_memory, bool is_test) { + auto prim_key = key_ + "@batch_norm_p"; + auto batch_norm_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + + PADDLE_ENFORCE((batch_norm_p != nullptr) || !is_reusing_, + "Fail to find batch norm primitive in device context"); + + if (batch_norm_p == nullptr) { + if (is_test) { + batch_norm_p = std::make_shared( + *batch_norm_pd_, *src_memory, + (const mkldnn::primitive::at &)*mean_memory, + (const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory, + 
*dst_memory); + } else { + batch_norm_p = std::make_shared( + *batch_norm_pd_, *src_memory, *scaleshift_memory, *dst_memory, + *mean_memory, *variance_memory); + } + + dev_ctx_.SetBlob(prim_key, batch_norm_p); + } else { + is_reusing_ = true; + } + + return batch_norm_p; + } + + static std::string GetHash(const memory::dims &input_dims, float epsilon, + unsigned flag, bool is_test, memory::format format, + const std::string &suffix = "") { + auto dims2str = [](const memory::dims &operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + std::to_string(epsilon) + + std::to_string(flag) + std::to_string(is_test) + + std::to_string(format) + suffix; + } + + private: + std::shared_ptr batch_norm_pd_; +}; + +std::shared_ptr UpdateMemoryData( + const platform::MKLDNNDeviceContext &dev_ctx, const std::string &key, + void *new_ptr) { + auto mem = std::static_pointer_cast(dev_ctx.GetBlob(key)); + PADDLE_ENFORCE( + mem != nullptr, + (std::string("Fail to find memory in device context [key: ") + key + "]") + .c_str()); + mem->set_data_handle(new_ptr); + return mem; +} + template void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, Container *c) { @@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end)))); } -template -void run_batch_norm_op(Args &&... args) { - Op batch_norm_op{args...}; - - std::vector pipeline; - pipeline.push_back(batch_norm_op); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); -} - } // namespace template @@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int ic = scale_tz[0]; + // MKLDNN requires a single piece of memory for scale and shift/bias data + const size_t scaleshift_size = 2 * ic; + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; @@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - auto src_memory = memory( - {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, - to_void_cast(x_data)); + // keys for backward pass + const std::string key = BatchNormMKLDNNHandler::GetHash( + src_tz, epsilon, flags, is_test, input_format, + ctx.op().Output("SavedMean")); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input_format); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; - std::shared_ptr batch_norm_fwd_pd = - std::shared_ptr( - new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, - mkldnn_engine)); - - // Save the pd to be used in backward pass - const std::string key = ctx.op().Output("SavedMean"); - const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + auto 
batch_norm_fwd_desc = + bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags}; + auto batch_norm_fwd_pd = std::make_shared( + batch_norm_fwd_desc, mkldnn_engine); + // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); - // MKLDNN requires a single piece of memory for scale and shift/bias data - const size_t scaleshift_size = 2 * ic; - std::vector scaleshift_data; - scaleshift_data.reserve(scaleshift_size); + BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, + key); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + auto src_memory = + handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) - auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), - scaleshift_data.data()); + auto scaleshift_memory = + handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data()); // create mkldnn memory for output y tensor - auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + auto dst_memory = handler.AcquireDstMemory( + batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); + std::shared_ptr batch_norm_p; if (is_test) { // create mkldnn memory for stats (as input) - auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), - to_void_cast(mean_data)); - auto variance_memory = - memory(batch_norm_fwd_pd->variance_primitive_desc(), - to_void_cast(variance_data)); - - run_batch_norm_op( - *batch_norm_fwd_pd, src_memory, - (const mkldnn::primitive::at &)mean_memory, - (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst_memory); + std::shared_ptr mean_memory = + handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data)); + std::shared_ptr variance_memory = + handler.AcquireVarianceMemoryFromPrimitive( + to_void_cast(variance_data)); + + batch_norm_p = handler.AcquireTestTrainingBatchNormFwd( + src_memory, scaleshift_memory, dst_memory, mean_memory, + variance_memory, true); } else { // create mkldnn memory for stats (as output) - auto mean_memory = - memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); - auto variance_memory = memory( - batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - - run_batch_norm_op(*batch_norm_fwd_pd, src_memory, - scaleshift_memory, dst_memory, - mean_memory, variance_memory); + std::shared_ptr mean_memory = + handler.AcquireMeanMemoryFromPrimitive(batch_mean_data); + std::shared_ptr variance_memory = + handler.AcquireVarianceMemoryFromPrimitive(batch_variance_data); + + batch_norm_p = handler.AcquireTestTrainingBatchNormFwd( + src_memory, scaleshift_memory, dst_memory, mean_memory, + variance_memory, false); } + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); + + std::vector pipeline; + pipeline.push_back(*batch_norm_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + if (!is_test) { // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib @@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { running_variance_e = variance_e * momentum + batch_variance_e * one_minus_momentum; } - - y->set_layout(DataLayout::kMKLDNN); - y->set_format( - (memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -242,61 +331,48 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { const 
unsigned int ic = scale_tz[0]; - // Retrieve bn_fwd_pd from device context - const std::string key = ctx.op().Input("SavedMean"); - const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto batch_norm_fwd_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_batch_norm_fwd_pd)); - PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, - "Fail to find batch_norm_fwd_pd in device context"); - using bn_bwd_types = bn_type_traits; - // create mkldnn memory from input diff_y tensor - mkldnn::memory::format dst_format = platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); - auto user_diff_dst_memory = memory( - {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, - to_void_cast(diff_y_data)); - - // create mkldnn memory from input x tensor mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - auto src_memory = memory( - {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, - to_void_cast(x_data)); + unsigned flags = mkldnn::use_scale_shift; - // for diff_dst, try to use same format as dst in forward pass - auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); - auto diff_dst_md = diff_dst_pd.desc(); + // keys from forward pass + const std::string key = BatchNormMKLDNNHandler::GetHash( + src_tz, epsilon, flags, false, input_format, + ctx.op().Input("SavedMean")); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + + // keys for primitives reuse + const std::string key_with_hash = + key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, + input_format); + const std::string key_batch_norm_bwd_p = + key_with_hash + "@batch_norm_bwd_p"; + const std::string key_batch_norm_src_mem_p = + key_with_hash + "@batch_norm_bwd_src_mem_p"; + const std::string key_batch_norm_mean_mem_p = + key_with_hash + "@batch_norm_bwd_mean_mem_p"; + const std::string key_batch_norm_variance_mem_p = + key_with_hash + "@batch_norm_bwd_variance_mem_p"; + const std::string key_batch_norm_scaleshift_mem_p = + key_with_hash + "@batch_norm_bwd_scaleshift_mem_p"; + const std::string key_batch_norm_diff_scaleshift_mem_p = + key_with_hash + "@batch_norm_bwd_diff_scaleshift_mem_p"; + const std::string key_batch_norm_diff_src_mem_p = + key_with_hash + "@batch_norm_bwd_diff_src_mem_p"; + const std::string key_batch_norm_diff_dst_mem_p = + key_with_hash + "@batch_norm_bwd_diff_dst_mem_p"; - // create primitive descriptor for batch norm backward - unsigned flags = mkldnn::use_scale_shift; - auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, - src_memory.get_primitive_desc().desc(), epsilon, flags}; - auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; - - // reorder user_diff_dst if it's not in preferred format - auto diff_dst_memory = user_diff_dst_memory; primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory = memory(diff_dst_pd); - reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); - is_diff_dst_reordered = true; - } - - // create mkldnn memory for input tensors (src/mean/variance) - auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), - to_void_cast(batch_mean_data)); - auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(), - to_void_cast(batch_variance_data)); + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + 
to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -306,30 +382,118 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, &scaleshift_data); - // create mkldnn memory for input tensors (scale/shift) - auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), - scaleshift_data.data()); - - // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - auto diff_scaleshift_memory = - memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()); - // here assume diff_src is in the same format of src - auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - // finally create batch_norm backward primitive - auto batch_norm_bwd_prim = - batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, - variance_memory, diff_dst_memory, scaleshift_memory, - diff_src_memory, diff_scaleshift_memory); + auto batch_norm_bwd_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_bwd_p)); + + if (batch_norm_bwd_p == nullptr) { + auto src_memory = std::shared_ptr(new memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data))); + + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + + // create primitive descriptor for batch norm backward + auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ + mkldnn::prop_kind::backward, diff_dst_md, + src_memory->get_primitive_desc().desc(), epsilon, flags}; + auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = std::make_shared(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = + std::make_shared(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = + std::make_shared(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + + // create mkldnn memory for input tensors (scale/shift) + auto scaleshift_memory = std::make_shared( + batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()); + + // create mkldnn memory for output diff weights (combined scale/shift) + auto diff_scaleshift_memory = std::make_shared( + batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + auto diff_src_memory = std::make_shared( + src_memory->get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward primitive + batch_norm_bwd_p = std::make_shared( + batch_norm_bwd_pd, *src_memory, *mean_memory, *variance_memory, + *diff_dst_memory, *scaleshift_memory, *diff_src_memory, + *diff_scaleshift_memory); 
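Both batch-norm paths in this file now cache their MKL-DNN objects: the forward pass through BatchNormMKLDNNHandler and the backward pass directly with SetBlob/GetBlob. In each case primitives and mkldnn::memory objects are created once under string keys derived from GetHash(), and later iterations only swap the data handles (UpdateMemoryData). The sketch below illustrates that create-or-reuse pattern in isolation; BlobCache and FakeMemory are made-up names for this illustration and are not the MKLDNNDeviceContext API:

#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

// String-keyed cache of type-erased objects, mirroring the SetBlob/GetBlob usage above.
class BlobCache {
 public:
  template <typename T>
  std::shared_ptr<T> GetOrCreate(const std::string& key,
                                 const std::function<std::shared_ptr<T>()>& create) {
    auto it = blobs_.find(key);
    if (it != blobs_.end()) {
      return std::static_pointer_cast<T>(it->second);  // reuse the cached object
    }
    auto obj = create();  // first use: build and remember it
    blobs_[key] = obj;
    return obj;
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<void>> blobs_;
};

struct FakeMemory {  // stand-in for mkldnn::memory in this sketch
  void* handle = nullptr;
  void set_data_handle(void* p) { handle = p; }
};

int main() {
  BlobCache cache;
  float batch0[4] = {0.f}, batch1[4] = {0.f};
  auto make_mem = [] { return std::make_shared<FakeMemory>(); };
  auto mem = cache.GetOrCreate<FakeMemory>("bn_bwd@src_mem", make_mem);
  mem->set_data_handle(batch0);  // first batch: freshly created memory object
  mem = cache.GetOrCreate<FakeMemory>("bn_bwd@src_mem", make_mem);
  mem->set_data_handle(batch1);  // later batches: same object, only the pointer changes
  return 0;
}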
+ + dev_ctx.SetBlob(key_batch_norm_bwd_p, batch_norm_bwd_p); + dev_ctx.SetBlob(key_batch_norm_src_mem_p, src_memory); + dev_ctx.SetBlob(key_batch_norm_mean_mem_p, mean_memory); + dev_ctx.SetBlob(key_batch_norm_variance_mem_p, variance_memory); + dev_ctx.SetBlob(key_batch_norm_scaleshift_mem_p, scaleshift_memory); + dev_ctx.SetBlob(key_batch_norm_diff_scaleshift_mem_p, + diff_scaleshift_memory); + dev_ctx.SetBlob(key_batch_norm_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); + } else { + // primitives already exist + UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_mean_mem_p, + to_void_cast(batch_mean_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_variance_mem_p, + to_void_cast(batch_variance_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_scaleshift_mem_p, + scaleshift_data.data()); + UpdateMemoryData(dev_ctx, key_batch_norm_diff_scaleshift_mem_p, + diff_scaleshift_data.data()); + auto diff_src_memory = UpdateMemoryData( + dev_ctx, key_batch_norm_diff_src_mem_p, to_void_cast(diff_x_data)); + auto diff_dst_memory = UpdateMemoryData( + dev_ctx, key_batch_norm_diff_dst_mem_p, to_void_cast(diff_y_data)); + + // reorder user_diff_dst if it's not in preferred format + if (diff_dst_memory->get_primitive_desc() != + user_diff_dst_memory.get_primitive_desc()) { + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); + } // execute optional reorder and batch_norm backward primitive std::vector pipeline; if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); - pipeline.push_back(batch_norm_bwd_prim); + pipeline.push_back(*batch_norm_bwd_p); stream(stream::kind::eager).submit(pipeline).wait(); // copy back diff sacle/shift to output tensors (diff scale/shift) @@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), diff_shift_data); - - // set layout/format of output tensors - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() - .desc() - .data.format); } }; } // namespace operators diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 693bf973c2b8790d2c50cee9b86b365493e8c754..5912a1a17cbd29c3ebd83f37133c044f0905c8bd 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -216,6 +216,18 @@ class BatchNormKernel saved_mean_e.setZero(); saved_variance_e.setZero(); + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + + if ((N * sample_size) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopySync(*x, ctx.GetPlace(), y); + return; + } + switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, 
N * C); @@ -247,10 +259,6 @@ class BatchNormKernel PADDLE_THROW("Unknown storage order: %s", data_layout_str); } - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); running_mean_arr = running_mean_arr * momentum + saved_mean_e * (1. - momentum); running_var_arr = @@ -427,6 +435,11 @@ class BatchNormGradKernel d_bias_arr.setZero(); d_scale_arr.setZero(); + if ((N * sample_size) == 1) { + framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + return; + } + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); switch (data_layout) { diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index 550dd32d36767f90e880415bfffaf01aeb623609..ca6cd8669352fd5814f25a04433ca97fe4abe9ff 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -72,6 +72,9 @@ class BatchNormKernel int N, C, H, W, D; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -93,7 +96,7 @@ class BatchNormKernel mode_ = CUDNN_BATCHNORM_SPATIAL; #endif - VLOG(1) << "Setting descriptors."; + VLOG(3) << "Setting descriptors."; std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { @@ -113,11 +116,6 @@ class BatchNormKernel const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); - auto *y = ctx.Output("Y"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -162,22 +160,28 @@ class BatchNormKernel functor(dev_ctx, saved_mean, static_cast>(0)); functor(dev_ctx, saved_variance, static_cast>(0)); - double this_factor = 1. - momentum; - - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), - data_desc_, x->template data(), data_desc_, - y->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); + if ((N * H * W * D) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopySync(*x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); + } } // clean when exit. 
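The CPU and CUDA forward kernels above keep the usual exponential moving average for the running statistics, running = running * momentum + batch_stat * (1 - momentum), and both now skip the normalization entirely when the normalization dimension holds a single element (y = x), since a one-sample variance is degenerate. A scalar sketch of that bookkeeping, with arbitrary numbers and initial values assumed only for this illustration:

#include <cstdio>
#include <vector>

int main() {
  const double momentum = 0.9;
  std::vector<double> batch = {1.0, 3.0, 5.0, 7.0};  // one channel's values
  double mean = 0.0, var = 0.0;
  for (double v : batch) mean += v;
  mean /= batch.size();
  for (double v : batch) var += (v - mean) * (v - mean);
  var /= batch.size();  // biased estimate, as in batch norm
  double running_mean = 0.0, running_var = 1.0;  // assumed starting values
  running_mean = running_mean * momentum + mean * (1.0 - momentum);
  running_var = running_var * momentum + var * (1.0 - momentum);
  std::printf("batch mean=%.2f var=%.2f -> running mean=%.2f var=%.2f\n",
              mean, var, running_mean, running_var);
  return 0;
}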
@@ -209,6 +213,25 @@ class BatchNormGradKernel int N, C, H, W, D; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + if ((N * H * W * D) == 1) { + framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -247,21 +270,11 @@ class BatchNormGradKernel CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); - auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 10d678111f5325e495b24286e6ecf651230393fe..b6cb935814e25b31d4104f9ce24fe952680cb491 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -74,7 +74,7 @@ struct BeamSearchDecodeFunctor { } template - void operator()() const; + void apply() const; bool tensor_on_gpu_; size_t beam_size_; @@ -88,7 +88,7 @@ struct BeamSearchDecodeFunctor { }; template -void BeamSearchDecodeFunctor::operator()() const { +void BeamSearchDecodeFunctor::apply() const { BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. 
If so, use the CPU copy instead if (tensor_on_gpu_) { @@ -101,7 +101,7 @@ void BeamSearchDecodeFunctor::operator()() const { } template <> -void BeamSearchDecodeFunctor::operator()() const { +void BeamSearchDecodeFunctor::apply() const { PADDLE_THROW("beam search decode op does not support bool!"); } diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 6220e57f5941d89cbf0aea268b85ad55af6132cc..8fa0416049f8fa128d7ab61f8350b41960f07263 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -37,7 +37,7 @@ struct CastOpFunctor { : in_(in), out_(out), ctx_(ctx) {} template - void operator()() const { + void apply() const { auto* in_begin = in_->data(); auto numel = in_->numel(); auto* in_end = in_begin + numel; diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc index c4219a429a53eb4869426a2674109555fb784b85..3a2527e407bb179c4873fa3ffe2e8f22fb47faf7 100644 --- a/paddle/fluid/operators/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase { VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name << " and dir:" << dir << " to " << epmap[i]; } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index a496301526f58875ff51aeaa5b2094c3c656531c..78be2e1e1f06c7a518e35a770c1dc9581b2d10fe 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -62,9 +62,21 @@ class ConcatGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); + auto ins = ctx.MultiInput("X"); auto out_var_names = ctx.Outputs(framework::GradVarName("X")); - auto outs = ctx.MultiOutput(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + { + auto dx = outs; + auto x = ins; + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + } + int64_t axis = static_cast(ctx.Attr("axis")); // get output tensor that the name is not kEmptyVarName diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index 8cc1d94260baccfe28d213b7e021956819e2e79e..135254ce6b6bf9add7bb1f0c3f645ed47081fba4 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -29,9 +29,9 @@ class ConditionalOp : public framework::OperatorBase { protected: std::vector InputTensors( - const framework::Scope &scope) const { + const framework::Scope &scope, const std::string &in_name) const { std::vector retv; - auto xs = Inputs("X"); + auto xs = Inputs(in_name); retv.resize(xs.size(), nullptr); std::transform( xs.begin(), xs.end(), retv.begin(), @@ -81,12 +81,18 @@ class ConditionalBlockOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - auto xs = InputTensors(scope); - bool need_run; if (Attr("is_scalar_condition")) { + // When is_scalar_condition is True, the conditional variable is a scalar, + // whether need to execute the operators in sub-block depends on the + // conditional variable (Cond). 
+ auto xs = InputTensors(scope, "Cond"); need_run = ScalarCondition(xs); } else { + // When is_scalar_condition is False, the conditional variable maybe a + // vector or tensor, whether need to execute the operators in sub-block + // depends on the input variables (Input). + auto xs = InputTensors(scope, "Input"); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); @@ -110,11 +116,11 @@ class ConditionalBlockOp : public ConditionalOp { class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", - "The conditional variable of this operator. If X is empty, the " + AddInput("Cond", + "The conditional variable of this operator. If Cond is empty, the " "whole sub-block will not be executed.") .AsDuplicable(); - AddInput("Params", "The input variables of the sub-block.").AsDuplicable(); + AddInput("Input", "The input variables of the sub-block.").AsDuplicable(); AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); AddOutput("Scope", "(std::vector) The step scope of conditional block. To " @@ -123,13 +129,18 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddAttr( "sub_block", "The step block of conditional block operator"); AddAttr("is_scalar_condition", - "the input X is used as scalar " - "condition") + "The conditional variable (Cond) is used as scalar " + "condition.") .SetDefault(false); AddComment(R"DOC(Conditional block operator -Run the sub-block if X is not empty. Params is the other inputs and Out is the -outputs of the sub-block. +If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar, +run the operators in sub-block if Cond is True. + +If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or +tensor, run the operators in sub-block if all of input variables are not empty. 
+ + )DOC"); } }; @@ -145,12 +156,12 @@ class ConditionalBlockGradOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - auto xs = this->InputTensors(scope); - bool need_run; if (Attr("is_scalar_condition")) { + auto xs = this->InputTensors(scope, "Cond"); need_run = ScalarCondition(xs); } else { + auto xs = this->InputTensors(scope, "Input"); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); @@ -166,11 +177,11 @@ class ConditionalBlockGradOp : public ConditionalOp { auto *block = Attr("sub_block"); exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), - Outputs(framework::GradVarName("Params"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), + Outputs(framework::GradVarName("Input"))); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), - Outputs(framework::GradVarName("X"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), + Outputs(framework::GradVarName("Cond"))); } } @@ -199,15 +210,16 @@ class ConditionalBlockGradOp : public ConditionalOp { class ConditionalBlockGradInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInputs("X")); - if (context->HasInputs("Params")) { - PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params"))); - context->SetOutputsDim(framework::GradVarName("Params"), - context->GetInputsDim("Params")); + PADDLE_ENFORCE(context->HasInputs("Cond")); + if (context->HasInputs("Input")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input"))); + context->SetOutputsDim(framework::GradVarName("Input"), + context->GetInputsDim("Input")); + } + if (context->HasOutputs(framework::GradVarName("Cond"))) { + context->SetOutputsDim(framework::GradVarName("Cond"), + context->GetInputsDim("Cond")); } - PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X"))); - context->SetOutputsDim(framework::GradVarName("X"), - context->GetInputsDim("X")); } }; @@ -219,14 +231,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); - grad_op->SetInput("X", Input("X")); - grad_op->SetInput("Params", Input("Params")); + grad_op->SetInput("Cond", Input("Cond")); + grad_op->SetInput("Input", Input("Input")); grad_op->SetInput("Out", Output("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); - grad_op->SetOutput(framework::GradVarName("Params"), - InputGrad("Params", false)); + grad_op->SetOutput(framework::GradVarName("Cond"), + InputGrad("Cond", false)); + grad_op->SetOutput(framework::GradVarName("Input"), + InputGrad("Input", false)); grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); return std::unique_ptr(grad_op); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 1828be57b5a54005a0066b18ebebdb740726f67a..4a7a6bcf7154d5680de751e3c933be46fb09fd74 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,10 +20,10 
@@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -DEFINE_bool(cudnn_deterministic, true, +DEFINE_bool(cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " - "false, the algorithm is deterministic."); + "true, the algorithm is deterministic."); namespace paddle { namespace operators { @@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { output_channels / groups * output_height * output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; if (user_workspace_size > 0) { @@ -129,7 +128,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &algo)); @@ -140,18 +139,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { if (dev_ctx.GetComputeCapability() >= 70 && std::type_index(typeid(T)) == std::type_index(typeid(platform::float16))) { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); } #endif // get workspace size able to allocate - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // It is possible for float16 on Volta GPU to allocate more memory than @@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, 
cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -218,7 +215,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -272,8 +269,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + if (!FLAGS_cudnn_deterministic) { + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, // dyDesc: Handle to the previously initialized input @@ -289,7 +286,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); @@ -297,8 +294,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + if (!FLAGS_cudnn_deterministic) { + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -308,17 +305,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. 
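The cuDNN conv kernels above stop allocating and freeing the workspace themselves and instead pass a lambda to dev_ctx.RunCudnnFuncWithWorkspace. Below is a rough host-side sketch of that helper pattern; it uses malloc/free where the real device context would use the GPU allocator, and the helper name here is taken from the calls above rather than from a public header.

#include <cstddef>
#include <cstdlib>
#include <functional>

// Sketch of the "run with scratch workspace" pattern: the kernel only supplies
// a callable that consumes the workspace pointer; allocation and release live
// in one place instead of being repeated in every conv kernel.
void RunFuncWithWorkspace(const std::function<void(void*)>& cudnn_func,
                          size_t workspace_size_in_bytes) {
  void* workspace = std::malloc(workspace_size_in_bytes);  // real code: device memory
  if (workspace == nullptr && workspace_size_in_bytes > 0) return;  // allocation failed
  cudnn_func(workspace);  // e.g. a lambda wrapping cudnnConvolutionForward(...)
  std::free(workspace);
}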
for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 6b06913d1c83f4534238ac3dd22ac4035c0f0fbf..eae65968285703f5882d910e29bc5d8e1511cba6 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -18,9 +18,6 @@ namespace paddle { namespace operators { -using conv_bwd_data = mkldnn::convolution_backward_data; -using conv_bwd_weights = mkldnn::convolution_backward_weights; -using conv_fwd = mkldnn::convolution_forward; using framework::DataLayout; using mkldnn::memory; using mkldnn::primitive; @@ -29,6 +26,240 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; +class ConvMKLDNNHandler : public platform::MKLDNNHandler { + public: + ConvMKLDNNHandler( + std::shared_ptr conv_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + conv_pd_ = conv_pd; + } + + ConvMKLDNNHandler( + std::shared_ptr conv_pd, + std::shared_ptr + conv_bwd_data_pd, + std::shared_ptr + conv_bwd_weights_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + conv_pd_(conv_pd), + conv_bwd_weights_pd_(conv_bwd_weights_pd), + conv_bwd_data_pd_(conv_bwd_data_pd) { + // If we are in Grad operatgor then update a key 
with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + size_t GetDstMemorySize() const { + return conv_pd_->dst_primitive_desc().get_size(); + } + + size_t GetDiffWeightsMemorySize() const { + return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size(); + } + + size_t GetDiffSourceMemorySize() const { + return conv_bwd_data_pd_->diff_src_primitive_desc().get_size(); + } + + std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, + "@weights-src_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@weights-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, + "@diff_weights_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@data-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline) { // NOLINT + auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); + auto user_pd = user_weights_memory_p->get_primitive_desc(); + return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, + "@data-weights_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); + } + + std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { + return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, + "@dst_mem_p"); + } + + std::shared_ptr AcquireSrcMemoryFromPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto src_pd = conv_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", + pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline, // NOLINT + bool is_persistent = false) { + auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); + auto weights_pd = conv_pd_->weights_primitive_desc(); + return this->AcquireMemory(weights_pd, user_weights_pd, + user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent); + } + + std::shared_ptr AcquireBiasMemoryFromPrimitive( + const std::shared_ptr user_bias_memory_p, + std::vector& pipeline) { // NOLINT + auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); + auto bias_pd = conv_pd_->bias_primitive_desc(); + return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, + "@bias_mem_p", pipeline); + } 
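The Acquire*MemoryFromPrimitive helpers above all follow the same rule: if the user tensor's memory format differs from the format the primitive prefers, a reorder is created and pushed onto the pipeline; otherwise the user memory is used directly. A library-free sketch of that rule, with formats reduced to an enum and the MKL-DNN types replaced by stand-ins:

#include <memory>
#include <vector>

enum class Format { kNCHW, kNChw16c, kAny };

struct MemoryLike {
  Format format;
  // payload omitted
};

struct ReorderLike {
  std::shared_ptr<MemoryLike> src, dst;
};

// Returns memory in the format the primitive expects, inserting a reorder
// into the pipeline only when the user-provided format does not match.
std::shared_ptr<MemoryLike> AcquireMemoryLike(
    Format preferred, std::shared_ptr<MemoryLike> user_memory,
    std::vector<ReorderLike>* pipeline) {
  if (user_memory->format == preferred) {
    return user_memory;  // no conversion needed
  }
  auto target = std::make_shared<MemoryLike>(MemoryLike{preferred});
  pipeline->push_back(ReorderLike{user_memory, target});  // executed when the pipeline runs
  return target;
}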
+ + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared( + *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), + *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr bias_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared( + *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), + *(bias_memory_p.get()), *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr + AcquireConvolutionBackwardWeights( + std::shared_ptr src_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_weights_memory_p) { + auto prim_key = key_ + "@conv_bwd_weights_p"; + auto conv_bwd_weights_p = + std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd weights primitive in device context"); + if (conv_bwd_weights_p == nullptr) { + // create backward conv primitive for weights + conv_bwd_weights_p = + std::make_shared( + *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, + *diff_weights_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); + } else { + is_reusing_ = true; + } + return conv_bwd_weights_p; + } + + std::shared_ptr + AcquireConvolutionBackwardData( + std::shared_ptr diff_dst_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr diff_src_memory_p) { + auto prim_key = key_ + "@conv_bwd_data_p"; + auto conv_bwd_data_p = + std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_data_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd data primitive in device context"); + if (conv_bwd_data_p == nullptr) { + conv_bwd_data_p = std::make_shared( + *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, + *diff_src_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); + } else { + is_reusing_ = true; + } + return conv_bwd_data_p; + } + + // Generate keys for storing/retriving primitives for this operator + // TODO(jczaja): Make hashing function more optimial + static std::string GetHash(memory::dims& input_dims, // NOLINT + memory::dims& weights_dims, // NOLINT + std::vector& strides, // NOLINT + std::vector& paddings, // NOLINT + std::vector& dilations, // NOLINT + int groups, const std::string& suffix) { + return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + + suffix; + } + + private: + std::shared_ptr conv_pd_; + std::shared_ptr + conv_bwd_weights_pd_; + std::shared_ptr + conv_bwd_data_pd_; +}; + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -36,9 +267,7 @@ 
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - // Get unique name for index - const std::string key = ctx.op().Output("Output"); - const std::string key_conv_pd = key + "@conv_pd"; + const bool is_test = ctx.Attr("is_test"); auto& dev_ctx = ctx.template device_context(); @@ -46,6 +275,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && @@ -54,112 +284,214 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. OIHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); + bool fuse_relu = ctx.Attr("fuse_relu"); + bool fuse_eltwise = ctx.Attr("fuse_eltwise"); int groups = ctx.Attr("groups"); - // TODO(pzelazko-intel) add support for group convolution and dilation - PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + // TODO: add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); const T* input_data = input->data(); const T* filter_data = filter->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); - PADDLE_ENFORCE(filter->dims().size() == 4, - "Filter must be with 4 dimensions, i.e. 
OIHW"); std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // create mkldnn memory from input tensors (data/weights) - auto user_src_memory = memory( - {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, - to_void_cast(input_data)); - auto user_weights_memory = - memory({{{weights_tz}, memory::data_type::f32, filter->format()}, - mkldnn_engine}, - to_void_cast(filter_data)); + // Get unique name for storing MKLDNN primitives + const std::string key = ConvMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector pipeline; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? filter->format() : mkldnn::memory::format::goihw); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose * the memory format preferred for best performance */ - auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, - memory::format::any); + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::f32, memory::format::any); - auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, - memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); + std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. + // Currently used whenever bias is != nullptr. 
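For groups > 1 the forward kernel above re-expresses the OIHW filter dims as GOIHW (g, o/g, i, h, w), the layout the grouped MKL-DNN convolution (memory::format::goihw) expects. A small standalone version of that reshape, using plain ints in place of memory::dims:

#include <algorithm>
#include <cassert>
#include <vector>

std::vector<int> ToGroupedWeightDims(std::vector<int> weights_tz, int groups) {
  int g = std::max(groups, 1);
  if (g > 1) {
    assert(weights_tz.size() == 4 && weights_tz[0] % g == 0);  // expects OIHW input
    const int o = weights_tz[0], i = weights_tz[1];
    const int h = weights_tz[2], w = weights_tz[3];
    weights_tz = {g, o / g, i, h, w};  // OIHW -> GOIHW
  }
  return weights_tz;  // unchanged when groups == 1
}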
+ auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); // create a conv primitive descriptor and save it for usage in backward - std::shared_ptr conv_pd = ConvFwdPrimitiveDesc( - src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); - - // create reorder primitive if the input format is not the preferred one - auto src_memory = user_src_memory; - primitive reorder_src; - bool is_src_reordered = false; - if (memory::primitive_desc(conv_pd->src_primitive_desc()) != - user_src_memory.get_primitive_desc()) { - src_memory = memory(conv_pd->src_primitive_desc()); - reorder_src = reorder(user_src_memory, src_memory); - is_src_reordered = true; - } - auto weights_memory = user_weights_memory; - primitive reorder_weights; - bool is_weights_reordered = false; - if (memory::primitive_desc(conv_pd->weights_primitive_desc()) != - user_weights_memory.get_primitive_desc()) { - weights_memory = memory(conv_pd->weights_primitive_desc()); - reorder_weights = reorder(user_weights_memory, weights_memory); - is_weights_reordered = true; + std::shared_ptr conv_pd; + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_eltwise); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_eltwise); } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); + + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); - // create memory primitive for conv dst - auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); + T* output_data = + output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + // create reorder primitive if the input format is not the preferred one + auto src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test); + auto dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); // create convolution op primitive - auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory); + std::shared_ptr conv_p; + if (bias) { + const T* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = + handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); + + auto bias_memory_p = + handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } // push primitive to stream and wait until it's executed - std::vector pipeline; - if (is_src_reordered) pipeline.push_back(reorder_src); - if (is_weights_reordered) pipeline.push_back(reorder_weights); - pipeline.push_back(conv_prim); + pipeline.push_back(*conv_p); 
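ConvMKLDNNHandler::GetHash together with dev_ctx.GetBlob/SetBlob above implements shape-and-attribute keyed caching: the first run with a given configuration builds the primitive descriptor and stores it under the key, and later iterations (and the backward kernel) fetch it instead of re-creating it. A simplified, self-contained version of that cache follows, with a plain map standing in for the device-context blob store and a dummy struct standing in for the primitive descriptor.

#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

struct ConvPrimitiveDescLike { /* stands in for an mkldnn primitive_desc */ };

static std::string DimsToStr(const std::vector<int>& dims) {
  std::ostringstream os;
  for (int d : dims) os << d << "-";
  return os.str();
}

// Key built from everything that changes the primitive: shapes, strides,
// paddings, dilations, groups and an operator-specific suffix.
std::string MakeConvKey(const std::vector<int>& src, const std::vector<int>& weights,
                        const std::vector<int>& strides, const std::vector<int>& paddings,
                        const std::vector<int>& dilations, int groups,
                        const std::string& suffix) {
  return DimsToStr(src) + DimsToStr(weights) + DimsToStr(strides) +
         DimsToStr(paddings) + DimsToStr(dilations) + std::to_string(groups) + suffix;
}

std::shared_ptr<ConvPrimitiveDescLike> AcquireConvPd(
    std::map<std::string, std::shared_ptr<ConvPrimitiveDescLike>>* blobs,
    const std::string& key) {
  auto it = blobs->find(key);
  if (it != blobs->end()) return it->second;            // reuse across iterations / bwd pass
  auto pd = std::make_shared<ConvPrimitiveDescLike>();  // build once (the expensive step)
  (*blobs)[key] = pd;
  return pd;
}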
stream(stream::kind::eager).submit(pipeline).wait(); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_conv_pd, conv_pd); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(dst_memory)); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); } private: - std::unique_ptr ConvFwdPrimitiveDesc( - const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, const mkldnn::engine& engine) const { + mkldnn::primitive_attr CreatePostOps(bool fuse_relu, + bool fuse_eltwise) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_eltwise is true, the + // Output tensor contains the data coming from residual connection. The + // result of this post_op is: Output = scale * Output + Conv_Out. + if (fuse_eltwise) { + post_operations.append_sum(1.0f); + } + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_eltwise) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_eltwise) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; - auto conv_desc = - conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, - src, weights, dst, stride_dims, padding_dims, - padding_dims, mkldnn::padding_kind::zero); + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + bias, dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); - auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); - return std::unique_ptr(p_conv_pd); + return std::unique_ptr( + p_conv_pd); } }; @@ -197,13 +529,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { 
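The post-ops configured in CreatePostOps above are equivalent, element-wise, to the reference loop below: with fuse_eltwise the pre-existing Output values act as a residual that the convolution result is summed into (append_sum with scale 1.0), and with fuse_relu a ReLU with negative slope 0 is applied on top. This is only a semantic illustration, not how MKL-DNN executes the fusion.

#include <algorithm>
#include <cstddef>
#include <vector>

void ApplyConvPostOpsReference(std::vector<float>* output,          // holds the residual on entry
                               const std::vector<float>& conv_out,  // raw convolution result
                               bool fuse_eltwise, bool fuse_relu) {
  for (std::size_t i = 0; i < output->size(); ++i) {
    float v = conv_out[i];
    if (fuse_eltwise) v += (*output)[i];   // Output = 1.0 * Output + Conv_Out
    if (fuse_relu) v = std::max(v, 0.0f);  // eltwise_relu with negative_slope = 0
    (*output)[i] = v;
  }
}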
if (!input_grad && !filter_grad) return; - // Get an unique name from "argument" name of "Output" variable - // This name will be used as key when saving info into device context - const std::string key = ctx.op().Input("Output"); - const std::string key_conv_pd = key + "@conv_pd"; - std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -211,158 +540,131 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) { - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - } - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // create mkldnn memory from input tensors (input/weights/output_grad) - auto user_src_memory = memory( - {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, - to_void_cast(input_data)); - auto user_weights_memory = - memory({{{weights_tz}, memory::data_type::f32, filter->format()}, - mkldnn_engine}, - to_void_cast(filter_data)); - auto user_diff_dst_memory = - memory({{{dst_tz}, memory::data_type::f32, output_grad->format()}, - mkldnn_engine}, - to_void_cast(output_grad_data)); + // Get an unique name from "argument" name of "Output" variable + // as well as attributes of primitive to be created + // This name will be used as key when saving info into device context + const std::string key = + ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings, + dilations, groups, ctx.op().Input("Output")); + + const std::string key_conv_pd = key + "@conv_pd"; + std::vector pipeline; + + // Create user memory descriptors + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + auto user_diff_dst_md = platform::MKLDNNMemDesc( + {dst_tz}, platform::MKLDNNGetDataType(), output_grad->format()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose * the memory format preferred for best performance */ - auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, - memory::format::any); - auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, - memory::format::any); + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::f32, memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto diff_weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::f32, memory::format::any); - auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, - memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), 
chosen_memory_format); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); // Retrieve conv_pd from device context - auto conv_pd = std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); + auto conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); + // create backward convolution weights primitive descriptor + auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( + mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_weights_pd = + std::make_shared( + conv_bwd_weights_desc, mkldnn_engine, *conv_pd); + + // create backward convolution data primitive descriptor + auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( + mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_data_pd = + std::make_shared( + conv_bwd_data_desc, mkldnn_engine, *conv_pd); + + ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd, + dev_ctx, mkldnn_engine, key); + + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory( + user_diff_dst_md, to_void_cast(output_grad_data)); + // create backward conv primitive for weights if (filter_grad) { - // create backward convolution primitive descriptor - auto conv_bwd_weights_desc = conv_bwd_weights::desc( - mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, - strides, paddings, paddings, mkldnn::padding_kind::zero); - auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc( - conv_bwd_weights_desc, mkldnn_engine, *conv_pd); - - // create reorder primitive if the input format is not the preferred one - auto src_memory = user_src_memory; - primitive reorder_src; - bool is_src_reordered = false; - if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) != - user_src_memory.get_primitive_desc()) { - src_memory = memory(conv_bwd_weights_pd.src_primitive_desc()); - reorder_src = reorder(user_src_memory, src_memory); - is_src_reordered = true; - } - - auto diff_dst_memory_4filter = user_diff_dst_memory; - primitive reorder_diff_dst_4filter; - bool is_diff_dst_reordered_4filter = false; - if (memory::primitive_desc( - conv_bwd_weights_pd.diff_dst_primitive_desc()) != - user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory_4filter = - memory(conv_bwd_weights_pd.diff_dst_primitive_desc()); - reorder_diff_dst_4filter = - reorder(user_diff_dst_memory, diff_dst_memory_4filter); - is_diff_dst_reordered_4filter = true; - } - - // create mkldnn memory for output (i.e. 
diff weights) - auto diff_weights_memory = - memory(conv_bwd_weights_pd.diff_weights_primitive_desc(), - reinterpret_cast(filter_grad_data)); + auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive( + user_src_memory_p, pipeline); - // create backward conv primitive for weights - auto conv_bwd_weights_prim = - conv_bwd_weights(conv_bwd_weights_pd, src_memory, - diff_dst_memory_4filter, diff_weights_memory); - - // push primitive and execute it - std::vector pipeline; - if (is_src_reordered) pipeline.push_back(reorder_src); - if (is_diff_dst_reordered_4filter) - pipeline.push_back(reorder_diff_dst_4filter); - pipeline.push_back(conv_bwd_weights_prim); - stream(stream::kind::eager).submit(pipeline).wait(); + auto diff_dst_memory_4filter_p = + handler.AcquireDiffDstMemoryFromWeightsPrimitive( + user_diff_dst_memory_p, pipeline); + + const size_t size = handler.GetDiffWeightsMemorySize(); + filter_grad_data = filter_grad->mutable_data(ctx.GetPlace(), size); + + auto diff_weights_memory_p = + handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( + reinterpret_cast(filter_grad_data)); + + auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights( + src_memory_p, diff_dst_memory_4filter_p, diff_weights_memory_p); + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_bwd_weights_p); filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory)); + filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); } if (input_grad) { - // create backward convolution primitive descriptor - auto conv_bwd_data_desc = conv_bwd_data::desc( - mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, - strides, paddings, paddings, mkldnn::padding_kind::zero); - auto conv_bwd_data_pd = conv_bwd_data::primitive_desc( - conv_bwd_data_desc, mkldnn_engine, *conv_pd); - - // create reorder primitive if the input format is not the preferred one - auto weights_memory = user_weights_memory; - primitive reorder_weights; - bool is_weights_reordered = false; - if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) != - user_weights_memory.get_primitive_desc()) { - weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc()); - reorder_weights = reorder(user_weights_memory, weights_memory); - is_weights_reordered = true; - } - - auto diff_dst_memory_4data = user_diff_dst_memory; - primitive reorder_diff_dst_4data; - bool is_diff_dst_reordered_4data = false; - if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) != - user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory_4data = - memory(conv_bwd_data_pd.diff_dst_primitive_desc()); - reorder_diff_dst_4data = - reorder(user_diff_dst_memory, diff_dst_memory_4data); - is_diff_dst_reordered_4data = true; - } - - // create mkldnn memory for output (i.e. 
diff src) - auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(), - reinterpret_cast(input_grad_data)); - - // create backward conv primitive for data - auto conv_bwd_data_prim = - conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory, - diff_src_memory); - - // push primitive and execute it - std::vector pipeline; - if (is_weights_reordered) pipeline.push_back(reorder_weights); - if (is_diff_dst_reordered_4data) - pipeline.push_back(reorder_diff_dst_4data); - pipeline.push_back(conv_bwd_data_prim); - stream(stream::kind::eager).submit(pipeline).wait(); + auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( + user_weights_memory_p, pipeline); + + auto diff_dst_memory_4data_p = + handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p, + pipeline); + + const size_t size = handler.GetDiffSourceMemorySize(); + input_grad_data = input_grad->mutable_data(ctx.GetPlace(), size); + + auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive( + reinterpret_cast(input_grad_data)); + + auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData( + diff_dst_memory_4data_p, weights_memory_p, diff_src_memory_p); + + pipeline.push_back(*conv_bwd_data_p); input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(diff_src_memory)); + input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } + stream(stream::kind::eager).submit(pipeline).wait(); } // Compute() }; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 37153d58439a90190eb2ad82d5dcc145e22dfa48..8f84bf71a7f77606bed6672f0830e3fc80165a42 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -37,6 +37,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { auto in_dims = ctx->GetInputDim("Input"); auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); int groups = ctx->Attrs().Get("groups"); @@ -57,7 +58,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, "The number of input channels should be equal to filter " "channels * groups."); - PADDLE_ENFORCE_EQ( filter_dims[0] % groups, 0, "The number of output channels should be divided by groups."); @@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } void Conv2DOpMaker::Make() { + AddAttr("is_test", "").SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution operator. " @@ -122,6 +123,11 @@ void Conv2DOpMaker::Make() { "H is the height of the filter, and W is the width of the filter. " "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Bias", + "(Tensor) Bias to be added to each output of filter application." + "The format of output tensor is X (one-dimensional) of size equal" + "to the number of output channels. Only used with MKL-DNN.") + .AsDispensable(); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " "The format of output tensor is also NCHW.") @@ -156,6 +162,13 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_eltwise", + "(bool, default false) Only used in mkldnn kernel. 
Used " + "whenever convolution output is connected via skip connection " + "to a previous layer.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 038ea8999072f562104c5386ed18b6b275816345..73831611d01b8c5b8d2d9f7f15634a0094e4a608 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { @@ -87,7 +86,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); // Get the algorithm - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. @@ -95,30 +94,26 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); // get workspace size able to allocate - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); - // ------------------- cudnn conv transpose forward --------------------- int input_offset = input->numel() / input->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -178,11 +173,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 
workspace_size_limit, &data_algo)); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &fwd_ws_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); @@ -190,7 +185,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { if (filter_grad) { // choose backward algorithm for filter - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -198,7 +193,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit, &filter_algo)); // get workspace for backwards filter algorithm - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); @@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::max(workspace_size_in_bytes, bwd_filter_ws_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. int input_offset = input->numel() / input->dims()[0] / groups; @@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } @@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. 
// Gradient with respect to the filter for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 3f5fab3b382bea97f43e4bc1b2cd436c956ba264..8181897c3d3844bda5574e85a08b2af038fcd664 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel { int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); +#ifdef __AVX__ +// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or +// 16 elements per iteration. Then it can implement the parallel processing. +// Only optimize for float type. +#ifdef __AVX512F__ + size_t step_size = 16; +#else + size_t step_size = 8; +#endif + if (std::is_same::value && (tag_num >= step_size)) { + size_t steps = tag_num / step_size; + size_t remain = tag_num % step_size; + int last_offset = static_cast(remain) - static_cast(step_size); + + // Setup the alpha initial value. + size_t i_offset = 0; + for (size_t i = 0; i <= steps; ++i) { +#ifdef __AVX512F__ + // Declare the variable for the content of weights, input and alpha + // values. + __m512 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm512_loadu_ps((const float*)(w + i_offset)); + x_content = _mm512_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm512_add_ps(w_content, x_content); + + // Save the alpha value. + _mm512_storeu_ps(reinterpret_cast(alpha_value + i_offset), + alpha_content); +#else + // Declare the variable for the content of weights, input and alpha + // values. + __m256 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm256_loadu_ps((const float*)(w + i_offset)); + x_content = _mm256_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm256_add_ps(w_content, x_content); + + // Save the alpha value. + _mm256_storeu_ps(reinterpret_cast(alpha_value + i_offset), + alpha_content); +#endif + i_offset += step_size; + if (i == steps - 1) { + if (remain > 0) { + i_offset += last_offset; + } else { + break; + } + } + } + + // Use the column-major strategy to get the location of maximum score. + size_t seq_offset = 0; + for (size_t k = 1; k < seq_len; ++k) { + size_t j_offset = 0; + for (size_t j = 0; j <= steps; ++j) { +#ifdef __AVX512F__ + // Initialize the variables of maximum score and location. 
+ __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); + __m512i max_j = _mm512_setzero_si512(); +#else + // Initialize the variables of maximum score and location. + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); + __m256i max_j = _mm256_set1_epi32(0); +#endif + // Calculate the offset of transition_weights. + size_t trans_offset = state_trans_base_idx * tag_num + j_offset; + for (size_t i = 0; i < tag_num; ++i) { +#ifdef __AVX512F__ + // Initialize the content of alpha variable with related offset. + __m512 alpha_content = + _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m512 w_content = + _mm512_loadu_ps((const float*)(w + trans_offset)); + + __m512 score_v = _mm512_add_ps(alpha_content, w_content); + + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); + + // According to the mask value, it updates the index of the max_score + // location. + max_j = _mm512_mask_set1_epi32(max_j, mask, i); + + // Update the max_score value. + max_score = _mm512_max_ps(max_score, score_v); +#else + // Initialize the content of alpha variable with related offset. + __m256 alpha_content = _mm256_broadcast_ss( + (const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m256 w_content = + _mm256_loadu_ps((const float*)(w + trans_offset)); + __m256 score_v = _mm256_add_ps(alpha_content, w_content); + + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); + +#ifdef __AVX2__ + // According to the mask value, it updates the index of the max_score + // location. + max_j = _mm256_or_si256( + _mm256_andnot_si256((__m256i)mask, max_j), + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); +#else + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); + __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); + __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); + + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); + + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); + + // According to the mask value, it updates the index of the max_score + // location. + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); +#endif + + // Update the max_score value. + max_score = _mm256_max_ps(max_score, score_v); +#endif + trans_offset += tag_num; + } + +#ifdef __AVX512F__ + // Update the alpha and track values. + __m512 x_content = _mm512_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm512_add_ps(max_score, x_content); + _mm512_storeu_ps(reinterpret_cast(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm512_storeu_si512( + reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#else + // Update the alpha and track values. 
+ __m256 x_content = _mm256_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm256_add_ps(max_score, x_content); + _mm256_storeu_ps(reinterpret_cast(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#endif + + // Calculate the offset of next step + j_offset += step_size; + if (j == steps - 1) { + if (remain > 0) { + j_offset += last_offset; + } else { + break; + } + } + } + + seq_offset += tag_num; + } + } else { + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + T score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + } +#else for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; for (size_t k = 1; k < seq_len; ++k) { @@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { } } +#endif T max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 5b5a220cf90e7813f914ae35733e7a4103391b2d..a2a871efa850df5101be7c27ebd81456acace7e1 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -188,6 +188,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); -REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop, ops::CropKernel); REGISTER_OP_CPU_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu index 1a391860463dba14ad0de755ceb659bc9f64adc9..b75678217e36aa2297c68a7f8e2a9dfafadaca72 100644 --- a/paddle/fluid/operators/crop_op.cu +++ b/paddle/fluid/operators/crop_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop, ops::CropKernel); REGISTER_OP_CUDA_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 772e80bbea4f2db654cefd0dcb404bc33803bd7a..2d7d33bd4f9b42b644444912570375bad92ba6c2 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -58,32 +58,74 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { return res; } -template +template +void CropFunction(const framework::ExecutionContext& context) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (out_dims[0] == -1) { + out_dims[0] = x->dims()[0]; + } + out->mutable_data(out_dims, context.GetPlace()); + auto x_stride = framework::stride(x->dims()); + auto out_stride = framework::stride(out->dims()); + auto offsets = GetOffsets(context); + int64_t offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + Eigen::array e_offsets; + Eigen::array e_shape; + for (size_t i = 0; i < D; ++i) { + e_offsets[i] = offsets[i]; + e_shape[i] = out->dims()[i]; + } + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); +} + +template class CropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - const T* x_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); - auto offsets = GetOffsets(context); - int64_t offset = 0; - for (size_t i = 0; i < offsets.size(); ++i) { - offset += (x_stride[i] * offsets[i]); + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + CropFunction(context); + break; + case 2: + CropFunction(context); + break; + case 3: + CropFunction(context); + break; + case 4: + CropFunction(context); + break; + case 5: + CropFunction(context); + break; + case 6: + CropFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); } - StridedMemcpy(context.device_context(), x_data + offset, x_stride, - out->dims(), out_stride, out_data); } }; template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); if (d_x != nullptr) { auto* d_out = context.Input(framework::GradVarName("Out")); - d_x->mutable_data(context.GetPlace()); + d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a..66f19fe7ecfa51b2ce917f0c5fcb6d486f1a7307 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -28,23 +28,26 @@ class CrossEntropyOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); - PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, - "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + 
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], - "If Attr(soft_label) == true, the 2nd dimension of " + PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], + "If Attr(soft_label) == true, the last dimension of " "Input(X) and Input(Label) should be equal."); } else { - PADDLE_ENFORCE_EQ(label_dims[1], 1UL, - "If Attr(softLabel) == false, the 2nd dimension of " + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL, + "If Attr(softLabel) == false, the last dimension of " "Input(Label) should be 1."); } - ctx->SetOutputDim("Y", {x_dims[0], 1}); + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); ctx->ShareLoD("X", /*->*/ "Y"); } @@ -74,24 +77,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto label_dims = ctx->GetInputDim("Label"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); - PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); - PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], - "The 1st dimension of Input(X) and Input(Y@Grad) should " - "be equal."); - PADDLE_ENFORCE_EQ(dy_dims[1], 1, - "The 2nd dimension of Input(Y@Grad) should be 1."); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], - "When Attr(soft_label) == true, the 2nd dimension of " + PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], + "When Attr(soft_label) == true, the last dimension of " "Input(X) and Input(Label) should be equal."); } else { - PADDLE_ENFORCE_EQ(label_dims[1], 1, - "When Attr(soft_label) == false, the 2nd dimension of " + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + "When Attr(soft_label) == false, the last dimension of " "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); @@ -113,26 +120,38 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," - " where N is the batch size and D is the number of classes. " - "This input is a probability computed by the previous operator, " - "which is almost always the result of a softmax operator."); - AddInput("Label", - "(Tensor), the ground truth which is a 2-D tensor. 
When " - "soft_label is set to false, Label is a Tensor with shape " - "[N x 1]. When soft_label is set to true, Label is a " - "Tensor with shape [N x D]."); + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "probability computed by the previous operator, which is almost " + "always the result of a softmax operator."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. When soft_label is set " + "to false, the last dimension size is 1; when soft_label is set to " + "true, the last dimension size is equal to the number of classes."); AddOutput("Y", - "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss.") - .Reuse("X"); + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") .SetDefault(false); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); AddComment(R"DOC( CrossEntropy Operator. +The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. +The matrix's second dimension(row length) is as same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take palce on each raw +of flattened matrixs. + It supports both standard cross-entropy and soft-label cross-entropy loss computation. 1) One-hot cross-entropy: diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 19a2aec92b267ece94685ce34604b7d1cfa5d209..03974a7fc511b1e1cb5b0eca532b260fdf9bf964 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -33,9 +33,14 @@ class CrossEntropyOpKernel : public framework::OpKernel { auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); + int rank = x->dims().size(); + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + math::CrossEntropyFunctor()( - ctx.template device_context(), y, x, labels, - ctx.Attr("soft_label")); + ctx.template device_context(), &y_2d, &x_2d, &labels_2d, + ctx.Attr("soft_label"), ctx.Attr("ignore_index")); } }; @@ -69,16 +74,22 @@ class XeGradFunctor { const T* dy, // NOLINT const T* x, // NOLINT const int64_t* label, // NOLINT - size_t num_classes) - : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + size_t num_classes, size_t ignore_index) + : dx_(dx), + dy_(dy), + x_(x), + label_(label), + num_classes_(num_classes), + ignore_index_(ignore_index) {} HOSTDEVICE void operator()(size_t sample_id) { auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; for (size_t x_offset = sample_id * num_classes_; x_offset < (sample_id + 1) * num_classes_; ++x_offset) { - dx_[x_offset] = x_offset != x_is_true_offset - ? static_cast(0) - : -dy_[sample_id] / x_[x_offset]; + dx_[x_offset] = + (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_) + ? 
static_cast(0) + : -dy_[sample_id] / x_[x_offset]; } } @@ -88,6 +99,7 @@ class XeGradFunctor { const T* x_; const int64_t* label_; size_t num_classes_; + size_t ignore_index_; }; template @@ -98,9 +110,13 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto* dy = ctx.Input(framework::GradVarName("Y")); auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); + T* dx_data = dx->mutable_data(ctx.GetPlace()); - int64_t class_num = x->dims()[1]; + // Following computation only depends on the last dimension size. So it's + // unnecessary to convert tensors to 2-D views. + int rank = x->dims().size(); + int64_t class_num = x->dims()[rank - 1]; + int64_t ignore_index = ctx.Attr("ignore_index"); if (ctx.Attr("soft_label")) { XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), label->data(), @@ -110,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { static_cast(dx->numel())); for_range(functor); } else { - XeGradFunctor functor(dx_data, dy->data(), x->data(), - label->data(), - static_cast(class_num)); + XeGradFunctor functor( + dx_data, dy->data(), x->data(), label->data(), + static_cast(class_num), static_cast(ignore_index)); platform::ForRange for_range( ctx.template device_context(), static_cast(dy->numel())); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 6d296ff7bf14de9175dc589dfa8b46c534127ca1..f4983c65432991a45f226d97f0fb05b08a30ca89 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -27,7 +27,9 @@ anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc target_assign_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc - polygon_box_transform_op.cu) - -# Export local libraries to parent +polygon_box_transform_op.cu) +detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) +detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(generate_proposals_op SRCS generate_proposals_op.cc) +#Export local libraries to parent set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h new file mode 100644 index 0000000000000000000000000000000000000000..6abeca1da443248d6ad3c1bcc64dd775d77f4ed8 --- /dev/null +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +/* + * transform that computes target bounding-box regression deltas + * given proposal boxes and ground-truth boxes. 
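+ * As implemented below: for a proposal box with center (px, py), width pw and height ph, and a matched ground-truth box with center (gx, gy), width gw and height gh, the targets are dx = (gx - px) / pw, dy = (gy - py) / ph, dw = log(gw / pw), dh = log(gh / ph); each component is then divided by the corresponding regression weight when weights are provided.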
+ */ +template +inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, + const framework::Tensor& gt_boxes, const float* weights, + const bool normalized, framework::Tensor* box_delta) { + auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); + auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); + auto trg = framework::EigenTensor::From(*box_delta); + T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; + for (int64_t i = 0; i < box_num; ++i) { + ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false); + ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false); + ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; + ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; + + gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false); + gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false); + gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; + gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; + + trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w; + trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h; + trg(i, 2) = std::log(gt_w / ex_w); + trg(i, 3) = std::log(gt_h / ex_h); + + if (weights) { + trg(i, 0) = trg(i, 0) / weights[0]; + trg(i, 1) = trg(i, 1) / weights[1]; + trg(i, 2) = trg(i, 2) / weights[2]; + trg(i, 3) = trg(i, 3) / weights[3]; + } + } +} + +template +void Gather(const T* in, const int in_stride, const int* index, const int num, + T* out) { + const int stride_bytes = in_stride * sizeof(T); + for (int i = 0; i < num; ++i) { + int id = index[i]; + memcpy(out + i * in_stride, in + id * in_stride, stride_bytes); + } +} + +template +void BboxOverlaps(const framework::Tensor& r_boxes, + const framework::Tensor& c_boxes, + framework::Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7a53f1bef98ecda3ba7b36323678a11a632a15c --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -0,0 +1,479 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateProposalLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("RpnRois"), + "Input(RpnRois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Rois"), + "Output(Rois) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("LabelsInt32"), + "Output(LabelsInt32) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxTargets"), + "Output(BboxTargets) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxInsideWeights"), + "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxOutsideWeights"), + "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); + + auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); + auto gt_classes_dims = ctx->GetInputDim("GtClasses"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, + "The rank of Input(RpnRois) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + + int class_nums = ctx->Attrs().Get("class_nums"); + + ctx->SetOutputDim("Rois", {-1, 4}); + ctx->SetOutputDim("LabelsInt32", {-1, 1}); + ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); + ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); + ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("RpnRois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +template +void Concat(const platform::CPUDeviceContext& context, + const Tensor& in_tensor_a, const Tensor& in_tensor_b, + 
Tensor* out_tensor) { + int axis = 0; + std::vector inputs; + inputs.emplace_back(in_tensor_a); + inputs.emplace_back(in_tensor_b); + math::ConcatFunctor concat_functor; + concat_functor(context, inputs, axis, out_tensor); +} + +template +std::vector> SampleFgBgGt( + const platform::CPUDeviceContext& context, Tensor* iou, + const Tensor& is_crowd, const int batch_size_per_im, + const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) { + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + int64_t gt_num = is_crowd.numel(); + const int* crowd_data = is_crowd.data(); + T* proposal_to_gt_overlaps = iou->data(); + int64_t row = iou->dims()[0]; + int64_t col = iou->dims()[1]; + float epsilon = 0.00001; + + // Follow the Faster RCNN's implementation + for (int64_t i = 0; i < row; ++i) { + const T* v = proposal_to_gt_overlaps + i * col; + T max_overlap = *std::max_element(v, v + col); + if ((i < gt_num) && (crowd_data[i])) { + max_overlap = -1.0; + } + if (max_overlap > fg_thresh) { + for (int64_t j = 0; j < col; ++j) { + T val = proposal_to_gt_overlaps[i * col + j]; + auto diff = std::abs(max_overlap - val); + if (diff < epsilon) { + fg_inds.emplace_back(i); + gt_inds.emplace_back(j); + break; + } + } + } else { + if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) { + bg_inds.emplace_back(i); + } + } + } + + // Reservoir Sampling + std::uniform_real_distribution uniform(0, 1); + int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); + int fg_rois_this_image = fg_inds.size(); + int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + } + } + } + } + std::vector new_fg_inds(fg_inds.begin(), + fg_inds.begin() + fg_rois_per_this_image); + std::vector new_gt_inds(gt_inds.begin(), + gt_inds.begin() + fg_rois_per_this_image); + + int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; + int bg_rois_this_image = bg_inds.size(); + int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } + } + } + std::vector new_bg_inds(bg_inds.begin(), + bg_inds.begin() + bg_rois_per_this_image); + std::vector> res; + res.emplace_back(new_fg_inds); + res.emplace_back(new_bg_inds); + res.emplace_back(new_gt_inds); + return res; +} + +template +void GatherBoxesLabels(const platform::CPUDeviceContext& context, + const Tensor& boxes, const Tensor& gt_boxes, + const Tensor& gt_classes, + const std::vector& fg_inds, + const std::vector& bg_inds, + const std::vector& gt_inds, Tensor* sampled_boxes, + Tensor* sampled_labels, Tensor* sampled_gts) { + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; + int* fg_inds_data = 
fg_inds_t.mutable_data({fg_num}, context.GetPlace()); + int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); + int* gt_box_inds_data = + gt_box_inds_t.mutable_data({fg_num}, context.GetPlace()); + int* gt_label_inds_data = + gt_label_inds_t.mutable_data({fg_num}, context.GetPlace()); + std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); + std::copy(bg_inds.begin(), bg_inds.end(), bg_inds_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data); + + Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; + fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); + CPUGather(context, boxes, fg_inds_t, &fg_boxes); + bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); + CPUGather(context, boxes, bg_inds_t, &bg_boxes); + Concat(context, fg_boxes, bg_boxes, sampled_boxes); + CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + fg_labels.mutable_data({fg_num}, context.GetPlace()); + CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + bg_labels.mutable_data({bg_num}, context.GetPlace()); + math::set_constant(context, &bg_labels, 0); + Concat(context, fg_labels, bg_labels, sampled_labels); +} + +template +std::vector SampleRoisForOneImage( + const platform::CPUDeviceContext& context, Tensor* rpn_rois, + Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, + const int batch_size_per_im, const float fg_fraction, const float fg_thresh, + const float bg_thresh_hi, const float bg_thresh_lo, + const std::vector& bbox_reg_weights, const int class_nums, + std::minstd_rand engine, bool use_random) { + auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); + auto im_scale = im_info->data()[2]; + rpn_rois_et = rpn_rois_et / im_scale; + + Tensor boxes; + int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; + boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); + Concat(context, *gt_boxes, *rpn_rois, &boxes); + + // Overlaps + Tensor proposal_to_gt_overlaps; + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes->dims()[0]}, + context.GetPlace()); + BboxOverlaps(boxes, *gt_boxes, &proposal_to_gt_overlaps); + + // Generate proposal index + std::vector> fg_bg_gt = SampleFgBgGt( + context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); + std::vector fg_inds = fg_bg_gt[0]; + std::vector bg_inds = fg_bg_gt[1]; + std::vector gt_inds = fg_bg_gt[2]; + + // Gather boxes and labels + Tensor sampled_boxes, sampled_labels, sampled_gts; + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + int boxes_num = fg_num + bg_num; + framework::DDim bbox_dim({boxes_num, kBoxDim}); + sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); + sampled_labels.mutable_data({boxes_num}, context.GetPlace()); + sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); + GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, + gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); + + // Compute targets + Tensor bbox_targets_single; + bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); + BoxToDelta(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(), + false, &bbox_targets_single); + + // Scale rois + Tensor sampled_rois; + sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); + auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); + auto sampled_boxes_et = 
framework::EigenTensor::From(sampled_boxes); + sampled_rois_et = sampled_boxes_et * im_scale; + + // Expand box targets + Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; + framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums}); + bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); + bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); + bbox_outside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); + math::set_constant(context, &bbox_targets, 0.0); + math::set_constant(context, &bbox_inside_weights, 0.0); + math::set_constant(context, &bbox_outside_weights, 0.0); + + auto* bbox_targets_single_data = bbox_targets_single.data(); + auto* sampled_labels_data = sampled_labels.data(); + auto* bbox_targets_data = bbox_targets.data(); + auto* bbox_inside_weights_data = bbox_inside_weights.data(); + auto* bbox_outside_weights_data = bbox_outside_weights.data(); + int width = kBoxDim * class_nums; + for (int64_t i = 0; i < boxes_num; ++i) { + int label = sampled_labels_data[i]; + if (label > 0) { + int dst_idx = i * width + kBoxDim * label; + int src_idx = kBoxDim * i; + bbox_targets_data[dst_idx] = bbox_targets_single_data[src_idx]; + bbox_targets_data[dst_idx + 1] = bbox_targets_single_data[src_idx + 1]; + bbox_targets_data[dst_idx + 2] = bbox_targets_single_data[src_idx + 2]; + bbox_targets_data[dst_idx + 3] = bbox_targets_single_data[src_idx + 3]; + bbox_inside_weights_data[dst_idx] = 1; + bbox_inside_weights_data[dst_idx + 1] = 1; + bbox_inside_weights_data[dst_idx + 2] = 1; + bbox_inside_weights_data[dst_idx + 3] = 1; + bbox_outside_weights_data[dst_idx] = 1; + bbox_outside_weights_data[dst_idx + 1] = 1; + bbox_outside_weights_data[dst_idx + 2] = 1; + bbox_outside_weights_data[dst_idx + 3] = 1; + } + } + std::vector res; + res.emplace_back(sampled_rois); + res.emplace_back(sampled_labels); + res.emplace_back(bbox_targets); + res.emplace_back(bbox_inside_weights); + res.emplace_back(bbox_outside_weights); + return res; +} + +template +class GenerateProposalLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* rpn_rois = context.Input("RpnRois"); + auto* gt_classes = context.Input("GtClasses"); + auto* is_crowd = context.Input("IsCrowd"); + auto* gt_boxes = context.Input("GtBoxes"); + auto* im_info = context.Input("ImInfo"); + + auto* rois = context.Output("Rois"); + auto* labels_int32 = context.Output("LabelsInt32"); + auto* bbox_targets = context.Output("BboxTargets"); + auto* bbox_inside_weights = context.Output("BboxInsideWeights"); + auto* bbox_outside_weights = + context.Output("BboxOutsideWeights"); + + int batch_size_per_im = context.Attr("batch_size_per_im"); + float fg_fraction = context.Attr("fg_fraction"); + float fg_thresh = context.Attr("fg_thresh"); + float bg_thresh_hi = context.Attr("bg_thresh_hi"); + float bg_thresh_lo = context.Attr("bg_thresh_lo"); + std::vector bbox_reg_weights = + context.Attr>("bbox_reg_weights"); + int class_nums = context.Attr("class_nums"); + bool use_random = context.Attr("use_random"); + + PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, + "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); + PADDLE_ENFORCE_EQ( + gt_classes->lod().size(), 1UL, + "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateProposalLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "GenerateProposalLabelsOp gt_boxes 
needs 1 level of LoD"); + int64_t n = static_cast(rpn_rois->lod().back().size() - 1); + + rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); + labels_int32->mutable_data({n * batch_size_per_im, 1}, + context.GetPlace()); + bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, + context.GetPlace()); + bbox_inside_weights->mutable_data( + {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + bbox_outside_weights->mutable_data( + {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + + std::random_device rnd; + std::minstd_rand engine; + int seed = rnd(); + engine.seed(seed); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_rois = 0; + auto& dev_ctx = context.device_context(); + + auto rpn_rois_lod = rpn_rois->lod().back(); + auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + auto gt_boxes_lod = gt_boxes->lod().back(); + for (int i = 0; i < n; ++i) { + Tensor rpn_rois_slice = + rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + std::vector tensor_output = SampleRoisForOneImage( + dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, + >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + engine, use_random); + Tensor sampled_rois = tensor_output[0]; + Tensor sampled_labels_int32 = tensor_output[1]; + Tensor sampled_bbox_targets = tensor_output[2]; + Tensor sampled_bbox_inside_weights = tensor_output[3]; + Tensor sampled_bbox_outside_weights = tensor_output[4]; + + AppendRois(rois, kBoxDim * num_rois, &sampled_rois); + AppendRois(labels_int32, num_rois, &sampled_labels_int32); + AppendRois(bbox_targets, kBoxDim * num_rois * class_nums, + &sampled_bbox_targets); + AppendRois(bbox_inside_weights, kBoxDim * num_rois * class_nums, + &sampled_bbox_inside_weights); + AppendRois(bbox_outside_weights, kBoxDim * num_rois * class_nums, + &sampled_bbox_outside_weights); + + num_rois += sampled_rois.dims()[0]; + lod0.emplace_back(num_rois); + } + + lod.emplace_back(lod0); + rois->set_lod(lod); + labels_int32->set_lod(lod); + bbox_targets->set_lod(lod); + bbox_inside_weights->set_lod(lod); + bbox_outside_weights->set_lod(lod); + rois->Resize({num_rois, kBoxDim}); + labels_int32->Resize({num_rois, 1}); + bbox_targets->Resize({num_rois, kBoxDim * class_nums}); + bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); + bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); + } +}; + +class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + // TODO(buxingyuan): Add Document + AddInput("RpnRois", "RpnRois."); + AddInput("GtClasses", "GtClasses."); + AddInput("IsCrowd", "IsCrowd."); + AddInput("GtBoxes", "GtBoxes."); + AddInput("ImInfo", "ImInfo."); + + AddOutput("Rois", "Rois."); + AddOutput("LabelsInt32", "LabelsInt32."); + AddOutput("BboxTargets", "BboxTargets."); + AddOutput("BboxInsideWeights", "BboxInsideWeights."); + AddOutput("BboxOutsideWeights", "BboxOutsideWeights."); + + AddAttr("batch_size_per_im", "batch_size_per_im"); + AddAttr("fg_fraction", "fg_fraction"); + AddAttr("fg_thresh", 
"fg_thresh"); + AddAttr("bg_thresh_hi", "bg_thresh_hi"); + AddAttr("bg_thresh_lo", "bg_thresh_lo"); + AddAttr>("bbox_reg_weights", "bbox_reg_weights"); + AddAttr("class_nums", "class_nums"); + AddAttr("use_random", "use_random").SetDefault(true); + + AddComment(R"DOC( +Generate Proposals Labels Operator. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_proposal_labels, ops::GenerateProposalLabelsOp, + ops::GenerateProposalLabelsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_proposal_labels, + ops::GenerateProposalLabelsKernel, + ops::GenerateProposalLabelsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c33aa255362bc5234f2813fb93e70c943b03c33f --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -0,0 +1,494 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +struct AppendProposalsFunctor { + LoDTensor *out_; + int64_t offset_; + Tensor *to_add_; + + AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add) + : out_(out), offset_(offset), to_add_(to_add) {} + + template + void apply() const { + auto *out_data = out_->data(); + auto *to_add_data = to_add_->data(); + memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T)); + } +}; + +class GenerateProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"), + "Input(BboxDeltas) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Anchors"), + "Input(Anchors) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Variances"), + "Input(Variances) shouldn't be null."); + + auto scores_dims = ctx->GetInputDim("Scores"); + auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto anchors_dims = ctx->GetInputDim("Anchors"); + auto variances_dims = ctx->GetInputDim("Variances"); + + ctx->SetOutputDim("RpnRois", {-1, 4}); + ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Anchors")->type()), + platform::CPUPlace()); 
+ } +}; + +template +void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, + Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(ctx.GetPlace()); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + } + // return proposals; +} + +template +void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info, + Tensor *boxes) { + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + const T *im_info_data = im_info.data(); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + } + } +} + +template +void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, + float min_size, const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + T im_scale = im_info_data[2]; + keep->Resize({boxes->dims()[0], 1}); + min_size = std::max(min_size, 1.0f); + int *keep_data = keep->mutable_data(ctx.GetPlace()); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / 
im_scale + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +bool SortScorePairDescend(const std::pair &pair1, + const std::pair &pair2) { + return pair1.first > pair2.first; +} + +template +void GetMaxScoreIndex(const std::vector &scores, + std::vector> *sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); +} + +template +T BBoxArea(const T *box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1); + const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1); + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, + const T nms_threshold, const float eta) { + PADDLE_ENFORCE_NOT_NULL(bbox); + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, &sorted_indices); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + bool flag; + while (sorted_indices.size() != 0) { + int idx = sorted_indices.front().second; + flag = true; + for (size_t k = 0; k < selected_indices.size(); ++k) { + if (flag) { + const int kept_idx = selected_indices[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + selected_num++; + } + sorted_indices.erase(sorted_indices.begin()); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + Tensor keep_nms; + keep_nms.Resize({selected_num}); + int *keep_data = keep_nms.mutable_data(ctx.GetPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + + return keep_nms; +} + +template +class GenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const 
override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto *anchors = context.Input("Anchors"); + auto *variances = context.Input("Variances"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + framework::LoD lod; + std::vector lod0(1, 0); + Tensor *anchor = const_cast(anchors); + anchor->Resize({anchors->numel() / 4, 4}); + Tensor *var = const_cast(variances); + var->Resize({var->numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = + ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + Tensor proposals = tensor_pair.first; + Tensor scores = tensor_pair.second; + + framework::VisitDataType( + framework::ToDataType(rpn_rois->type()), + AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals)); + framework::VisitDataType( + framework::ToDataType(rpn_roi_probs->type()), + AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores)); + + num_proposals += proposals.dims()[0]; + lod0.emplace_back(num_proposals); + } + + lod.emplace_back(lod0); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } + + std::pair ProposalForOneImage( + const DeviceContext &ctx, const Tensor &im_info_slice, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) const { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(ctx.GetPlace()); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + std::function compare = + [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if 
(pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, + index + scores_slice.numel(), compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); + bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + + CPUGather(ctx, scores_slice, index_t, &scores_sel); + CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(ctx, anchors, index_t, &anchor_sel); + CPUGather(ctx, variances, index_t, &var_sel); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(ctx, im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + + Tensor scores_filter; + bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); + CPUGather(ctx, proposals, keep, &bbox_sel); + CPUGather(ctx, scores_sel, keep, &scores_filter); + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_filter); + } + + Tensor keep_nms = NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + CPUGather(ctx, bbox_sel, keep_nms, &proposals); + CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + + return std::make_pair(proposals, scores_sel); + } +}; + +class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Scores", "The scores that anchors are foreground objects."); + AddInput("BboxDeltas", "bbox_deltas."); + AddInput("ImInfo", "Information for image reshape."); + AddInput("Anchors", "All anchors."); + AddInput("Variances", "The variances of anchors."); + + AddOutput("RpnRois", "The output proposals (RoIs)."); + AddOutput("RpnRoiProbs", "The scores of the output proposals."); + AddAttr("pre_nms_topN", "pre_nms_topN"); + AddAttr("post_nms_topN", "post_nms_topN"); + AddAttr("nms_thresh", "nms_thresh"); + AddAttr("min_size", "min size"); + AddAttr("eta", "eta"); + AddComment(R"DOC( +Generate Proposals OP + +This operator proposes RoIs according to each box's probability of being a foreground object; +the boxes are computed from the anchors. BboxDeltas and Scores are the outputs of the RPN. The final proposals +can be used to train the detection network. + +Scores is the probability that each box is an object, in the format (N, A, H, W), where N is the batch size, A is the number +of anchors, and H and W are the height and width of the feature map. +BboxDeltas is the difference between the predicted box location and the anchor location, in the format (N, 4*A, H, W). + +To generate proposals, this operator transposes and reshapes Scores and BboxDeltas to the sizes (H*W*A, 1) and (H*W*A, 4) and + calculates box locations as proposal candidates. It then clips the boxes to the image and removes predicted boxes with small area. +Finally, it applies NMS to obtain the final proposals as output. 
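+For example (illustrative sizes), with a feature map of H = 38 and W = 50 and A = 9 anchors per location, each image +yields 38 * 50 * 9 = 17100 candidate boxes, so the reshaped Scores and BboxDeltas for that image have the shapes +(17100, 1) and (17100, 4) before clipping, filtering and NMS.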
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp, + ops::GenerateProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + generate_proposals, + ops::GenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index d4a09bae3a98e4518f9885c1e9182f7033a0d262..54a4b87ec8f13c4d474aad4cc0b8159cd5f59d1c 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -227,6 +227,9 @@ class MineHardExamplesOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( neg_pos_ratio, 0.0f, "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_LT( + neg_dist_threshold, 1.0f, + "neg_dist_threshold must less than one in max_negative mode"); PADDLE_ENFORCE_GT( neg_dist_threshold, 0.0f, "neg_dist_threshold must greater than zero in max_negative mode"); diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index 4e35c38e4e03d4d0f00601812fdc4803519b89ae..b5cb6a724c095eb849f3a184f13843e1a0cca92f 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { "(float) " "Prior boxes center offset.") .SetDefault(0.5); + AddAttr( + "min_max_aspect_ratios_order", + "(bool) If set True, the output prior box is in order of" + "[min, max, aspect_ratios], which is consistent with Caffe." + "Please note, this order affects the weights order of convolution layer" + "followed by and does not affect the final detection results.") + .SetDefault(false); AddComment(R"DOC( Prior box operator Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu index f67e6ca91c0852b5a3be35d23246884d1157caa4..1ea8cfc1d2af8cc6c332768a467cdcd4c0166319 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cu +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height, const int im_width, const int as_num, const T offset, const T step_width, const T step_height, const T* min_sizes, - const T* max_sizes, const int min_num, - bool is_clip) { + const T* max_sizes, const int min_num, bool is_clip, + bool min_max_aspect_ratios_order) { int num_priors = max_sizes ? 
as_num * min_num + min_num : as_num * min_num; int box_num = height * width * num_priors; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num; @@ -44,14 +44,28 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height, T min_size = min_sizes[m]; if (max_sizes) { int s = p % (as_num + 1); - if (s < as_num) { - T ar = aspect_ratios[s]; - bw = min_size * sqrt(ar) / 2.; - bh = min_size / sqrt(ar) / 2.; + if (!min_max_aspect_ratios_order) { + if (s < as_num) { + T ar = aspect_ratios[s]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } else { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } } else { - T max_size = max_sizes[m]; - bw = sqrt(min_size * max_size) / 2.; - bh = bw; + if (s == 0) { + bw = bh = min_size / 2.; + } else if (s == 1) { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } else { + T ar = aspect_ratios[s - 1]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } } } else { int s = p % as_num; @@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto variances = ctx.Attr>("variances"); auto flip = ctx.Attr("flip"); auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); std::vector aspect_ratios; ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); @@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { GenPriorBox<<>>( boxes->data(), r.data(), height, width, im_height, im_width, aspect_ratios.size(), offset, step_width, step_height, min.data(), - max_data, min_num, clip); + max_data, min_num, clip, min_max_aspect_ratios_order); framework::Tensor v; framework::TensorFromVector(variances, ctx.device_context(), &v); diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35..4e226abbb51c271502f0ca5419d488643b5a1a82 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel { auto variances = ctx.Attr>("variances"); auto flip = ctx.Attr("flip"); auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); std::vector aspect_ratios; ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); @@ -108,26 +110,59 @@ class PriorBoxOpKernel : public framework::OpKernel { int idx = 0; for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; - // priors with different aspect ratios - for (size_t r = 0; r < aspect_ratios.size(); ++r) { - float ar = aspect_ratios[r]; - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; - e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; - e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; - e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; - idx++; - } - if (max_sizes.size() > 0) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; + if (min_max_aspect_ratios_order) { + box_width = box_height = min_size / 2.; e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; e_boxes(h, w, idx, 3) = 
(center_y + box_height) / img_height; idx++; + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } + // priors with different aspect ratios + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } + } else { + // priors with different aspect ratios + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } } } } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dda423efd35b96f5e1d7c55389818f46ef3d8694 --- /dev/null +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -0,0 +1,550 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +template +using EigenMatrix = framework::EigenMatrix; + +class RpnTargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Anchor"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of RpnTargetAssignOp should not be null"); + + PADDLE_ENFORCE( + ctx->HasOutput("LocationIndex"), + "Output(LocationIndex) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("ScoreIndex"), + "Output(ScoreIndex) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("TargetLabel"), + "Output(TargetLabel) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("TargetBBox"), + "Output(TargetBBox) of RpnTargetAssignOp should not be null"); + + auto anchor_dims = ctx->GetInputDim("Anchor"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, + "The rank of Input(Anchor) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + + ctx->SetOutputDim("LocationIndex", {-1}); + ctx->SetOutputDim("ScoreIndex", {-1}); + ctx->SetOutputDim("TargetLabel", {-1, 1}); + ctx->SetOutputDim("TargetBBox", {-1, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Anchor")->type()), + platform::CPUPlace()); + } +}; + +template +void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +template +std::vector FilterStraddleAnchor( + const platform::CPUDeviceContext& context, const Tensor* anchor, + const float rpn_straddle_thresh, T im_height, T im_width) { + std::vector inds_inside; + int anchor_num = anchor->dims()[0]; + auto* anchor_data = anchor->data(); + if (rpn_straddle_thresh >= 0) { + int index; + for (int i = 0; i < anchor_num; ++i) { + index = i * 4; + if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && + (anchor_data[index + 1] >= -rpn_straddle_thresh) && + (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && + (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) { + inds_inside.emplace_back(i); + } + } + } else { + for (int i = 0; i < anchor_num; ++i) { + inds_inside.emplace_back(i); + } + } + int inside_num = inds_inside.size(); + Tensor inds_inside_t; + int* inds_inside_data = + inds_inside_t.mutable_data({inside_num}, context.GetPlace()); + std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); + Tensor 
inside_anchor_t; + T* inside_anchor_data = + inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); + Gather(anchor->data(), 4, inds_inside_data, inside_num, + inside_anchor_data); + std::vector res; + res.emplace_back(inds_inside_t); + res.emplace_back(inside_anchor_t); + return res; +} + +template +Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, + Tensor* gt_boxes, Tensor* is_crowd) { + int gt_num = gt_boxes->dims()[0]; + std::vector not_crowd_inds; + auto* is_crowd_data = is_crowd->data(); + for (int i = 0; i < gt_num; ++i) { + if (is_crowd_data[i] == 0) { + not_crowd_inds.emplace_back(i); + } + } + int ncrowd_num = not_crowd_inds.size(); + Tensor ncrowd_gt_boxes; + T* ncrowd_gt_boxes_data = + ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); + Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_boxes_data); + return ncrowd_gt_boxes; +} + +void ReservoirSampling(const int num, std::vector* inds, + std::minstd_rand engine, bool use_random) { + std::uniform_real_distribution uniform(0, 1); + size_t len = inds->size(); + if (len > static_cast(num)) { + if (use_random) { + for (size_t i = num; i < len; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < num) + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); + } + } + inds->resize(num); + } +} + +template +void ScoreAssign(const T* anchor_by_gt_overlap_data, + const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max, + const int rpn_batch_size_per_im, const float rpn_fg_fraction, + const float rpn_positive_overlap, + const float rpn_negative_overlap, std::vector* fg_inds, + std::vector* bg_inds, std::vector* tgt_lbl, + std::minstd_rand engine, bool use_random) { + float epsilon = 0.00001; + int anchor_num = anchor_to_gt_max.dims()[0]; + int gt_num = gt_to_anchor_max.dims()[0]; + std::vector target_label(anchor_num, -1); + std::vector fg_inds_fake; + std::vector bg_inds_fake; + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); + const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); + // TODO(buxingyuan): Match with Detectron now + // but it seems here is a bug in two directions assignment + // in which the later one may overwrites the former one. 
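+  // The loop below collects foreground candidates following the two
+  // Faster R-CNN rules: (i) an anchor whose overlap with some gt box
+  // equals that gt box's maximum overlap (compared within epsilon), and
+  // (ii) an anchor whose best overlap over all gt boxes is at least
+  // rpn_positive_overlap. Candidates are later subsampled to at most
+  // rpn_fg_fraction * rpn_batch_size_per_im foreground labels, and anchors
+  // whose best overlap is below rpn_negative_overlap fill the remaining
+  // minibatch slots as background labels.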
+ for (int64_t i = 0; i < anchor_num; ++i) { + bool is_anchors_with_max_overlap = false; + for (int64_t j = 0; j < gt_num; ++j) { + T value = anchor_by_gt_overlap_data[i * gt_num + j]; + T diff = std::abs(value - gt_to_anchor_max_data[j]); + if (diff < epsilon) { + is_anchors_with_max_overlap = true; + break; + } + } + bool is_anchor_great_than_thresh = + (anchor_to_gt_max_data[i] >= rpn_positive_overlap); + if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { + fg_inds_fake.push_back(i); + } + } + + // Reservoir Sampling + int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); + fg_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_num; ++i) { + target_label[fg_inds_fake[i]] = 1; + } + + int bg_num = rpn_batch_size_per_im - fg_num; + for (int64_t i = 0; i < anchor_num; ++i) { + if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { + bg_inds_fake.push_back(i); + } + } + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); + for (int64_t i = 0; i < bg_num; ++i) { + target_label[bg_inds_fake[i]] = 0; + } + + for (int64_t i = 0; i < anchor_num; ++i) { + if (target_label[i] == 1) fg_inds->emplace_back(i); + if (target_label[i] == 0) bg_inds->emplace_back(i); + } + fg_num = fg_inds->size(); + bg_num = bg_inds->size(); + + tgt_lbl->resize(fg_num + bg_num, 0); + std::vector fg_lbl(fg_num, 1); + std::vector bg_lbl(bg_num, 0); + std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); + std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); +} + +template +std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, bool use_random) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + // Follow the Faster RCNN's implementation + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, + rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine, + use_random); + + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + gt_inds.reserve(fg_num); + for (int i = 0; i < fg_num; ++i) { + 
gt_inds.emplace_back(argmax[fg_inds[i]]); + } + + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t; + int* loc_index_data = loc_index_t.mutable_data({fg_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_num}, place); + std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + + return loc_score_tgtlbl_gt; +} + +template +class RpnTargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RpnTargetAssignOp is_crowd needs 1 level of LoD"); + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); + float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); + float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); + float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); + float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); + bool use_random = context.Attr("use_random"); + + int64_t max_num = batch_num * rpn_batch_size_per_im; + auto place = context.GetPlace(); + + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); + + auto& dev_ctx = context.device_context(); + + std::random_device rnd; + std::minstd_rand engine; + int seed = rnd(); + engine.seed(seed); + + framework::LoD lod_loc, loc_score; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + for (int i = 0; i < batch_num; ++i) { + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = FilterStraddleAnchor( + dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // 
Filter crowd gt + Tensor ncrowd_gt_boxes = + FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = SampleRpnFgBgGt( + dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine, + use_random); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); + + // get target bbox deltas + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + total_loc_num += loc_num; + + total_score_num += score_num; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); + } + + PADDLE_ENFORCE_LE(total_loc_num, max_num); + PADDLE_ENFORCE_LE(total_score_num, max_num); + + lod_loc.emplace_back(lod0_loc); + loc_score.emplace_back(lod0_score); + loc_index->set_lod(lod_loc); + score_index->set_lod(loc_score); + tgt_bbox->set_lod(lod_loc); + tgt_lbl->set_lod(loc_score); + loc_index->Resize({total_loc_num}); + score_index->Resize({total_score_num}); + tgt_bbox->Resize({total_loc_num, 4}); + tgt_lbl->Resize({total_score_num, 1}); + } +}; + +class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Anchor", + "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); + AddInput("GtBoxes", + "(LoDTensor) input groud-truth bbox with shape [K, 4]."); + AddInput("IsCrowd", + "(LoDTensor) input which indicates groud-truth is 
crowd.");
+    AddInput("ImInfo",
+             "(LoDTensor) input image information with shape [N, 3]. "
+             "N is the batch size; each image's information includes its "
+             "height, width and scale.");
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<float>(
+        "rpn_straddle_thresh",
+        "Remove RPN anchors that go outside the image by straddle_thresh "
+        "pixels. Set to -1 or a large value, e.g. 100000, to disable "
+        "pruning anchors.");
+    AddAttr<float>(
+        "rpn_positive_overlap",
+        "Minimum overlap required between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a positive example.")
+        .SetDefault(0.7);
+    AddAttr<float>(
+        "rpn_negative_overlap",
+        "Maximum overlap allowed between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a negative example.")
+        .SetDefault(0.3);
+    AddAttr<float>(
+        "rpn_fg_fraction",
+        "Target fraction of the RoI minibatch that "
+        "is labeled foreground (i.e. class > 0); the 0-th class is background.")
+        .SetDefault(0.25);
+    AddAttr<bool>("use_random",
+                  "A flag indicating whether to use reservoir sampling. "
+                  "NOTE: DO NOT set this flag to false in training. "
+                  "Setting this flag to false is only useful in unit tests.")
+        .SetDefault(true);
+    AddOutput(
+        "LocationIndex",
+        "(Tensor), The indexes of foreground anchors in all RPN anchors; the "
+        "shape of LocationIndex is [F], where F depends on the values of the "
+        "input tensors and attributes.");
+    AddOutput(
+        "ScoreIndex",
+        "(Tensor), The indexes of foreground and background anchors in all "
+        "RPN anchors (the rest of the anchors are ignored). The shape of "
+        "ScoreIndex is [F + B], where F and B are the sampled foreground and "
+        "background numbers.");
+    AddOutput("TargetBBox",
+              "(Tensor), The target bbox deltas with shape "
+              "[F, 4], where F is the sampled foreground number.");
+    AddOutput(
+        "TargetLabel",
+        "(Tensor), The target labels of each anchor with shape "
+        "[F + B, 1], where F and B are the sampled foreground and background "
+        "numbers.");
+    AddComment(R"DOC(
+Given a set of ground-truth bboxes and anchors, this operator assigns
+classification and regression targets to each prediction.
+ScoreIndex and LocationIndex are generated according to the anchor-to-ground-truth IoU.
+The remaining anchors do not contribute to the RPN training loss.
+
+ScoreIndex is composed of foreground anchor indexes (positive labels) and
+background anchor indexes (negative labels). LocationIndex is exactly the same
+as the foreground anchor indexes, since regression targets cannot be assigned to
+the background anchors.
+
+The classification target (TargetLabel) is a binary class label (of being
+an object or not). Following the Faster R-CNN paper, positive labels
+are assigned to two kinds of anchors: (i) the anchor/anchors with the highest IoU
+overlap with a ground-truth box, and (ii) anchors that have an IoU overlap
+higher than rpn_positive_overlap (0.7) with any ground-truth box. Note that
+a single ground-truth box may assign positive labels to multiple anchors.
+An anchor is assigned a negative label when its IoU is lower than rpn_negative_overlap
+(0.3) for all ground-truth boxes. Anchors that are neither positive nor
+negative do not contribute to the training objective.
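With the default attributes (rpn_batch_size_per_im = 256 and rpn_fg_fraction = 0.25), at most 64 anchors per image receive a foreground label and the rest of the 256-anchor budget is filled with background anchors. The subsampling is reservoir-style: later candidates are swapped into the first num slots with decreasing probability, so every candidate is equally likely to survive. The small self-contained C++ sketch below mirrors the ReservoirSampling helper in this file but is written independently for illustration, not copied from the patch.

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Keep at most `num` indices from `inds`; each index survives with equal
// probability. Later elements are swapped into the first `num` slots.
void SampleAtMost(int num, std::vector<int>* inds, std::minstd_rand* engine) {
  if (static_cast<int>(inds->size()) <= num) return;
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  for (size_t i = num; i < inds->size(); ++i) {
    int j = static_cast<int>(std::floor(uniform(*engine) * i));
    if (j < num) std::iter_swap(inds->begin() + j, inds->begin() + i);
  }
  inds->resize(num);
}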
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp, + ops::RpnTargetAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel, + ops::RpnTargetAssignKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 3d529737414d54f05e8c82ede1d6068e5d261110..7f989dfca699d498432f8df3f86c44723faeb980 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel { int64_t k = x->dims()[2]; auto x_lod = x->lod().back(); +#if defined(PADDLE_WITH_CUDA) size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace()); +#else + size_t* x_lod_data = x_lod.data(); +#endif TargetAssignFunctor functor(x_data, match_idx_data, x_lod_data, mismatch_value, n, m, p, k, out_data, @@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL); const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); +#if defined(PADDLE_WITH_CUDA) size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); +#else + size_t* neg_lod_data = neg_lod.data(); +#endif NegTargetAssignFunctor neg_trg_functor; neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k, mismatch_value, out_data, out_wt_data); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 675ca36774beb72cc1e9b136ad0b18ce061689ac..56734b81e8716a0c0c37a11e35c9118ee7b55020 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -1,33 +1,44 @@ +if(NOT WITH_DISTRIBUTE) + return() +endif() + +if(WITH_GRPC) + set(cc_generic_services "false") +else() + set(cc_generic_services "true") +endif() +configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) + if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor - selected_rows memory) + grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr - cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) - cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc - grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - proto_desc lookup_table_op SERIAL) + cc_test(grpc_serde_test SRCS grpc_serde_test.cc + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares 
zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + cc_test(varhandle_test SRCS varhandle_test.cc) return() endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) -find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) -ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) - +set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) -find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) -ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) +cc_test(brpc_server_test SRCS rpc_server_test.cc + DEPS ${brpc_test_depends} SERIAL) -cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc - brpc protobuf leveldb gflags glog - protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) +cc_test(brpc_serde_test SRCS brpc_serde_test.cc + DEPS ${brpc_test_depends} SERIAL) diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc similarity index 96% rename from paddle/fluid/operators/distributed/bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc index 6e91b447db838c9095432eda22e9e1171e938d31..d192f54ee0c924b772045d9b6a01701f640e07c7 100644 --- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc @@ -17,7 +17,7 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. -#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h similarity index 87% rename from paddle/fluid/operators/distributed/bytebuffer_stream.h rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h index e7de172c79c30761483b5d96f5bad19860208832..e9074574cdd163bbf7e62939df9283352706f840 100644 --- a/paddle/fluid/operators/distributed/bytebuffer_stream.h +++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "grpc++/grpc++.h" +#include "paddle/fluid/operators/distributed/variable_response.h" namespace grpc { // A ZeroCopyInputStream that reads from grpc_byte_buffer @@ -107,25 +108,6 @@ class GrpcBufferReader final namespace paddle { namespace operators { namespace distributed { -// Source provides a way for a particular RPC implementation to provide -// received data to ParseFrom. -class Source { - public: - virtual ~Source() {} - - // Return the stream that contains the data to be parsed. - // Note that this method might be invoked more than once if - // ParseFrom needs to fall back to a more expensive parsing method. - // Every call must return a stream pointing at the beginning of - // the serialized RecvTensorResponse. - // - // Note that a subsequent call to contents() invalidates previous - // results of contents(). - // - // Ownership of the returned stream is retained by the Source and - // should not be deleted by the caller. - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; -}; // A ZeroCopyInputStream that reads from a grpc::ByteBuffer. class GrpcByteBufferSource diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 4a09f3870d64d8e14b2db41ff3ea7c2f9e67b558..e22bc552f85b85c75f06b4158f2abac2d3843256 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" @@ -35,23 +36,20 @@ void GRPCClient::InitEventLoop() { client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); } -void GRPCClient::SendBeginPass() { - for (auto& it : channels_) { - VLOG(3) << "send begin pass to: " << it.first; - this->AsyncSendBeginPass(it.first); - } - this->Wait(); -} - -void GRPCClient::SendEndPass() { - for (auto& it : channels_) { - VLOG(3) << "send end pass to " << it.first; - this->AsyncSendEndPass(it.first); +void GRPCClient::SendComplete() { + std::unique_lock lk(completed_mutex_); + if (!completed_) { + for (auto& it : channels_) { + VLOG(3) << "send complete message to " << it.first; + this->AsyncSendComplete(it.first); + } + PADDLE_ENFORCE(this->Wait(), "internal grpc error"); + completed_ = true; } - this->Wait(); } GRPCClient::~GRPCClient() { + stopped_ = true; Wait(); cq_.Shutdown(); { @@ -59,40 +57,34 @@ GRPCClient::~GRPCClient() { for (auto& it : channels_) { it.second.reset(); } + channels_.clear(); } client_thread_->join(); } -bool GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + SendProcessor* s = new SendProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - 
framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); - // varhandle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Send"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - SendProcessor* s = new SendProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = nullptr; auto call = s->stub_g_.PrepareUnaryCall( @@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep, }); req_count_++; - return true; + return h; } void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { framework::Variable* outvar = nullptr; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar); } template @@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { result->Swap(&tmp); } -bool GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Get"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, req_count_++; - return true; + return h; } -bool GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h( + new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, 
p_scope)); + s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + time_out, s, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = out_var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Prefetch"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -206,69 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, }); req_count_++; - return true; + return h; } -void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h( + new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; - req.set_varname(BEGIN_PASS_MESSAGE); + req.set_varname(COMPLETE_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) { - const auto ch = GetChannel(ep); - - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); - - sendrecv::VariableMessage req; - req.set_varname(END_PASS_MESSAGE); - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; -} - -void GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { const auto ch = GetChannel(ep); CheckpointNotifyProcessor* s = new 
CheckpointNotifyProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(CHECKPOINT_SAVE_MESSAGE); @@ -277,35 +255,55 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::Wait() { +bool GRPCClient::Wait() { std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); + return ok_; } void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; - while (cq_.Next(&tag, &ok)) { + VLOG(3) << "GRPCClient Proceed begin"; + while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { - VLOG(3) << c->var_h_.String() << " process"; + VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); + } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { + LOG(ERROR) << c->GetVarHandlePtr()->String() + << " meets grpc error:" << c->status_.error_message(); + { + std::lock_guard lk(sync_mutex_); + ok_ = false; + } + c->Finish(false); } else { - LOG(FATAL) << c->var_h_.String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); + c->Finish(false); } - delete c; + + bool notify = false; { std::lock_guard lk(sync_mutex_); req_count_--; + notify = (req_count_ <= 0 || !c->status_.ok()); + } + + delete c; + + if (notify) { + sync_cond_.notify_all(); } - sync_cond_.notify_all(); } + VLOG(3) << "GRPCClient Proceed end"; } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 5dae20155edcf9edd746a5d9a9bbe0ccd789f431..75a3662316462a222760bfbb7d7906c70f46d143 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -38,7 +38,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -46,36 +49,18 @@ namespace paddle { namespace operators { namespace distributed { -struct VarHandle { - // RPC endpoint. - std::string ep; - const platform::DeviceContext* ctx; - const framework::Scope* scope; - // Variable name. - std::string name; - // RPC method name. 
- std::string method; - - std::string String() const { - std::ostringstream s; - s << method << " name:[" << name << "], ep:[" << ep << "]"; - return s.str(); - } -}; - void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); class BaseProcessor { public: - explicit BaseProcessor(std::shared_ptr ch) { - context_ = nullptr; - } + BaseProcessor() { context_ = nullptr; } virtual ~BaseProcessor() {} - virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + virtual void Prepare(VarHandlePtr h, int64_t time_out) { + var_h_ = h; + context_.reset(new grpc::ClientContext()); - var_h_ = var_info; context_->set_wait_for_ready(true); if (time_out) { std::chrono::system_clock::time_point deadline = @@ -85,21 +70,21 @@ class BaseProcessor { } } - virtual void Prepare(int64_t time_out) { - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); - - context_->set_deadline(deadline); + void Process() { + ProcessImpl(); + var_h_->Finish(true); } - virtual void Process() = 0; + VarHandlePtr GetVarHandlePtr() { return var_h_; } + bool Wait() { return var_h_->Wait(); } + void Finish(bool ok) { return var_h_->Finish(ok); } + virtual void ProcessImpl() = 0; std::unique_ptr context_; grpc::Status status_; - VarHandle var_h_; + + protected: + VarHandlePtr var_h_; }; typedef std::function @@ -108,13 +93,13 @@ typedef std::function class SendProcessor : public BaseProcessor { public: explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~SendProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -129,13 +114,13 @@ typedef std::function class GetProcessor : public BaseProcessor { public: explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~GetProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -147,13 +132,13 @@ class GetProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor { public: explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~BatchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -161,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor { class FetchBarrierProcessor : public BaseProcessor { public: explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~FetchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VariableMessage reply_; std::unique_ptr stub_; }; @@ -175,57 +160,57 @@ class FetchBarrierProcessor : public BaseProcessor { class CheckpointNotifyProcessor : public BaseProcessor { public: explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~CheckpointNotifyProcessor() {} - virtual void Process() {} + void ProcessImpl() override 
{} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; class GRPCClient : public RPCClient { public: - GRPCClient() {} + GRPCClient() : ok_(true), completed_(false), stopped_(false) {} virtual ~GRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBeginPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendEndPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + bool Wait() override; - void SendBeginPass() override; - - void SendEndPass() override; + void SendComplete() override; protected: void InitImpl() override; @@ -247,10 +232,17 @@ class GRPCClient : public RPCClient { std::mutex sync_mutex_; std::condition_variable sync_cond_; std::atomic req_count_{0}; + bool ok_; // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(GRPCClient); + + // mutex for sending complete message only once + std::mutex completed_mutex_; + bool completed_; + + volatile bool stopped_; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f8796713a6b89a308113981614673e07e8d367f --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_name) { + // Default DestroyCallback does nothing, When using GPU + // the CPU buffer need to be freed. + DestroyCallback destroy_callback = [](void* backing) {}; + VarMsg request; + void* payload = nullptr; + size_t payload_size; + + request.set_varname(name); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request.set_profile(platform::kEnableProfiler); + } else { + request.set_profile(platform::kDisableProfiler); + } + } + if (!out_name.empty()) { + request.set_out_varname(out_name); + } + if (var->IsType()) { + request.set_type(::sendrecv::LOD_TENSOR); + GetTensorPayload(var, ctx, &request, &payload, &payload_size); + } else if (var->IsType()) { + request.set_type(::sendrecv::SELECTED_ROWS); + GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request.set_type(::sendrecv::NCCL_ID); +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + // GPU data is copied to CPU buffer when sending, + // free the buffer when possible. + destroy_callback = [](void* backing) { + platform::CUDAPinnedPlace cuda_pinned; + memory::Free(cuda_pinned, backing); + }; +#endif + } + + std::string header; + request.AppendToString(&header); + auto buffer = std::unique_ptr(new char[1024]); + void* buf = buffer.get(); + ProtoEncodeHelper e(static_cast(buf), 1024); + e.WriteRawBytes(std::string(header.data(), header.size())); +// NCCLID is copied directly to the message, return bytebuffer +// with only one slice if serializing NCCLID. 
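+// For every other variable type the message is assembled from up to four
+// slices so the tensor data itself is not copied:
+//   slices[0]: the VarMsg header encoded above;
+//   slices[1]: the tensor payload, handed to gRPC with
+//              grpc_slice_new_with_user_data (STEAL_REF), so destroy_callback
+//              can free the pinned CPU copy that was made for GPU tensors;
+//   slices[2], slices[3]: only for SelectedRows, the rows length header and
+//              the rows buffer.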
+#ifdef PADDLE_WITH_CUDA + if (var->IsType()) { + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, + NCCL_UNIQUE_ID_BYTES); + const ncclUniqueId& uid = var->Get(); + e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); + + // for serialize NCCL_ID + ::grpc::Slice slices(e.size()); + memcpy(const_cast(slices.begin()), e.data(), e.size()); + ::grpc::ByteBuffer tmp(&slices, 1); + msg->Swap(&tmp); + return; + } +#endif + + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, + static_cast(payload)), + ::grpc::Slice::STEAL_REF); + + if (var->IsType()) { + auto* slr = var->GetMutable(); + ProtoEncodeHelper e2(static_cast(buf), 128); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, [](void* backing) {}, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var) { + operators::distributed::GRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); + *var = resp.GetVar(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h new file mode 100644 index 0000000000000000000000000000000000000000..450c41dcd6b1bf9a33d3bbef3a1c94a2f83ff322 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +typedef void (*DestroyCallback)(void*); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_varname = std::string()); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc index 3d107b533bcb7bfef3f9b13ec99afbd579a62e52..96ea05e74ed76768248a27ab435dc801b7d1b995 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -21,8 +21,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" @@ -84,7 +86,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::distributed::VariableResponse resp(&scope, &ctx); + operators::distributed::GRPCVariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -171,7 +173,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::distributed::VariableResponse resp(&scope, &ctx); + operators::distributed::GRPCVariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41..8edb00276df3ade1b320fbf2873e8b54ff3e1464 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; @@ -84,9 +85,9 @@ class RequestSend final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx(), - !request_handler->sync_mode())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), + !request_handler->sync_mode())); int method_id = static_cast(distributed::GrpcMethod::kSendVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, @@ -109,7 +110,7 @@ class RequestSend final : public RequestBase { protected: sendrecv::VoidMessage reply_; - std::shared_ptr request_; + std::shared_ptr request_; ServerAsyncResponseWriter responder_; }; @@ -161,8 +162,8 @@ class RequestPrefetch final : public RequestBase { : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_), local_scope_(nullptr) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); int method_id = static_cast(distributed::GrpcMethod::kPrefetchVariable); service_->RequestAsyncUnary( @@ -194,7 +195,7 @@ class RequestPrefetch final : public RequestBase { } protected: - std::shared_ptr request_; + std::shared_ptr request_; ::grpc::ByteBuffer reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* local_scope_; @@ -206,8 +207,8 @@ class RequestCheckpointNotify final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx())); int method_id = static_cast(distributed::GrpcMethod::kCheckpointNotify); service_->RequestAsyncUnary( @@ -234,7 +235,7 @@ class RequestCheckpointNotify final : public RequestBase { } protected: - std::shared_ptr request_; + std::shared_ptr request_; sendrecv::VoidMessage reply_; ServerAsyncResponseWriter responder_; }; diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h index cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0..9ae9a31a003cbb1f808fd1127a5dd78511aa3e99 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -23,8 +23,7 @@ #include #include #include -#include "paddle/fluid/operators/distributed/variable_response.h" - +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" // NOTE: This method was originally created by tensorflow @@ -42,17 +41,18 @@ class ServerContext; // Support parsing/unparsing of tensorflow::VariableResponse. // Wire-format is identical to RecvVariableResponse. 
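The specialization that follows is what lets the generic async service pass its receive buffer straight to the zero-copy parser. Restated as a pattern with a hypothetical MyZeroCopyResponse type (the real code below uses GRPCVariableResponse; a Parse(grpc_byte_buffer*) method returning 0 on success is assumed):

// Editorial sketch of the SerializationTraits contract, not the patch itself.
namespace grpc {
template <>
class SerializationTraits<MyZeroCopyResponse> {
 public:
  static Status Serialize(const MyZeroCopyResponse&, grpc_byte_buffer**,
                          bool*) {
    // Receive-only type: serialization is intentionally unsupported.
    return Status(StatusCode::UNIMPLEMENTED, "not implemented");
  }
  static Status Deserialize(grpc_byte_buffer* buffer, MyZeroCopyResponse* msg,
                            int max_message_size = INT_MAX) {
    if (buffer == nullptr) return Status(StatusCode::INTERNAL, "No payload");
    return msg->Parse(buffer) == 0
               ? Status::OK
               : Status(StatusCode::INTERNAL, "parse failed");
  }
};
}  // namespace grpc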
template <> -class SerializationTraits { +class SerializationTraits< + paddle::operators::distributed::GRPCVariableResponse> { public: static Status Serialize( - const paddle::operators::distributed::VariableResponse& msg, + const paddle::operators::distributed::GRPCVariableResponse& msg, grpc_byte_buffer** bp, bool* own_buffer) { PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); return Status(); } static Status Deserialize( grpc_byte_buffer* buffer, - paddle::operators::distributed::VariableResponse* msg, + paddle::operators::distributed::GRPCVariableResponse* msg, int max_message_size = INT_MAX) { if (buffer == nullptr) { return Status(StatusCode::INTERNAL, "No payload"); diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc new file mode 100644 index 0000000000000000000000000000000000000000..34d47f3ec0f3025109447b66078b724607d2953a --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +enum WireType { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, +}; + +inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } + +inline WireType GetTagWireType(uint32_t tag) { + return static_cast(tag & 0x7); +} + +bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, + int* result) { + uint64_t v; + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + *result = static_cast(v); + return true; + } else { + return false; + } +} + +int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { + GrpcByteBufferSource source; + source.Init(byte_buffer); + GrpcByteBufferSourceWrapper r(&source); + + return Parse(&r); +} + +bool ParseLodData(::google::protobuf::io::CodedInputStream* input, + std::vector* lod) { + while (true) { + auto p = input->ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + + if (!p.second) { + return (tag == 0); + } + + switch (tag) { + case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { + uint64_t v; + if (wt == WIRETYPE_VARINT) { + if (!input->ReadVarint64(&v)) { + return false; + } + lod->push_back(v); + break; + } + + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input->ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input->CurrentPosition(); + while (input->CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input->ReadVarint64(&v)) { + return tag; + } + lod->push_back(v); + } + break; + } + + return false; + } + default: { return false; } + } + } + + return true; +} + +int 
GRPCVariableResponse::Parse(Source* source) { + ::google::protobuf::io::ZeroCopyInputStream* input_stream = + source->contents(); + ::google::protobuf::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (true) { + auto p = input.ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + if (tag != 0) { + return -1; + } + return 0; + } + + switch (tag) { + case sendrecv::VariableMessage::kVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_varname(temp); + break; + } + case sendrecv::VariableMessage::kTypeFieldNumber: { + uint32_t v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_type(static_cast<::sendrecv::VarType>(v)); + break; + } + case sendrecv::VariableMessage::kDataTypeFieldNumber: { + uint32_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); + break; + } + case sendrecv::VariableMessage::kDimsFieldNumber: { + // not packed + if (wt == WIRETYPE_VARINT) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + break; + } + + // packed + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input.ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input.CurrentPosition(); + while (input.CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + } + break; + } + return tag; + } + case sendrecv::VariableMessage::kLodLevelFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_lod_level(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kLodFieldNumber: { + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = + input.IncrementRecursionDepthAndPushLimit(length); + + std::vector lod_data; + if (p.second < 0 || !ParseLodData(&input, &lod_data)) { + return tag; + } + + if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { + return tag; + } + + if (lod_data.size() == 0) { + break; + } + + auto lod = meta_.add_lod(); + for (uint32_t i = 0; i < lod_data.size(); i++) { + lod->add_lod_data(lod_data[i]); + } + break; + } + case sendrecv::VariableMessage::kSlrHeightFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_slr_height(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kSerializedFieldNumber: { + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!ProcSerializedField(tag, &input, num_bytes)) { + return tag; + } + + break; + } + case sendrecv::VariableMessage::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return 
tag; + } + break; + } + case sendrecv::VariableMessage::kOutVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_out_varname(temp); + break; + } + case sendrecv::VariableMessage::kProfileFieldNumber: { + uint64_t profiling = 0; + if (!input.ReadVarint64(&profiling)) { + return tag; + } + meta_.set_profile(profiling); + int64_t listener_id = platform::ListenerId(); + if (listener_id <= 0) { + break; + } + if (profiling == platform::kEnableProfiler && + !platform::IsProfileEnabled()) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else if (profiling == platform::kDisableProfiler && + platform::IsProfileEnabled()) { + // TODO(panyx0718): Should we allow to customize file dir. + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("/tmp/profile_ps_%lld", listener_id)); + } + break; + } + default: { + // Unknown tag, return unknown error. + return -1; + } + } + } + + return 0; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc_variable_response.h new file mode 100644 index 0000000000000000000000000000000000000000..89df07c92cd33bcb76c8539b5566d74fa21bba5e --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_variable_response.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class GRPCVariableResponse : public VariableResponse { + public: + GRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~GRPCVariableResponse() {} + + int Parse(Source* source) override; + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. 
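A quick aside on the framing the hand-written Parse loop above depends on: a protobuf key is (field_number << 3) | wire_type, which is exactly what GetTagFieldNumber and GetTagWireType undo. A tiny self-contained round-trip check (the field numbers used here are illustrative, not taken from send_recv.proto):

// Editorial sketch of the tag layout decoded by GRPCVariableResponse::Parse.
#include <cassert>
#include <cstdint>

inline uint32_t MakeTag(int field_number, int wire_type) {
  return (static_cast<uint32_t>(field_number) << 3) |
         static_cast<uint32_t>(wire_type);
}
inline int FieldNumber(uint32_t tag) { return tag >> 3; }
inline int WireType(uint32_t tag) { return tag & 0x7; }

int main() {
  // Wire type 2 is length-delimited (strings such as varname),
  // wire type 0 is varint (enums such as type).
  uint32_t tag = MakeTag(1, 2);
  assert(FieldNumber(tag) == 1 && WireType(tag) == 2);
  tag = MakeTag(2, 0);
  assert(FieldNumber(tag) == 2 && WireType(tag) == 0);
  return 0;
}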
+ int Parse(const ::grpc::ByteBuffer& byte_buffer); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index 2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c..d2b0eb6ca6de1984dc7cfc2a662c88d5e56e1e05 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -82,8 +82,10 @@ class ProtoEncodeHelper { : base_(buf), p_(buf), limit_(base_ + max_size) {} ~ProtoEncodeHelper() { +#define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - PADDLE_ENFORCE_LE(p_, limit_); + paddle::platform::throw_on_error(p_ <= limit_); +#undef REPLACE_ENFORCE_GLOG } const char* data() const { return base_; } diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 271306d5d20f1b849a81a9bfa6436f2faf261204..3c3f9d17c871ac1cb4df83db17cf489d5b9e0563 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { @@ -43,14 +44,83 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" -#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV" -#define END_PASS_MESSAGE "END_PASS@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" class RPCServer; +class VarHandle { + public: + VarHandle(const std::string ep, const std::string& method, + const std::string& name, + const platform::DeviceContext* p_ctx = nullptr, + const framework::Scope* p_scope = nullptr) + : ok_(kVarHandleDefaultState) { + ep_ = ep; + ctx_ = p_ctx; + scope_ = p_scope; + name_ = name; + method_ = method; + } + + virtual ~VarHandle() {} + + public: + bool Wait() { + { + std::unique_lock lk(sync_mutex_); + wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; }); + } + VLOG(7) << "VarHandle wait:" << ok_; + return ok_ != 0; + } + + void Finish(bool ok) { + { + std::unique_lock lk(sync_mutex_); + ok_ = ok; + } + VLOG(7) << "VarHandle finish:" << ok; + wait_cond_.notify_all(); + } + + std::string String() const { + std::ostringstream s; + s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_ + << "]"; + return s.str(); + } + + std::string ep() const { return ep_; } + const platform::DeviceContext* ctx() const { return ctx_; } + const framework::Scope* scope() const { return scope_; } + std::string name() const { return name_; } + std::string method() const { return method_; } + + protected: + // RPC endpoint. + std::string ep_; + const platform::DeviceContext* ctx_; + const framework::Scope* scope_; + // Variable name. + std::string name_; + // RPC method name. 
+ std::string method_; + + protected: + std::mutex sync_mutex_; + std::condition_variable wait_cond_; + int ok_; + + static const int kVarHandleDefaultState = -1; + + private: + DISABLE_COPY_AND_ASSIGN(VarHandle); +}; + +typedef std::shared_ptr VarHandlePtr; + class RequestHandler { public: explicit RequestHandler(bool sync_mode) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6..849e412504eb9180b746db65fd4fa353ed0c05a1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -39,52 +39,39 @@ bool RequestSendHandler::Handle(const std::string& varname, const std::string& out_var_name) { VLOG(4) << "RequestSendHandler:" << varname; - // Async - if (!sync_mode_) { - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } - return true; - } - // Sync if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv batch barrier message"; + VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == BEGIN_PASS_MESSAGE) { - VLOG(3) << "sync: recv begin pass message"; - rpc_server_->WaitCond(kRequestSend); - rpc_server_->BeginPass(); + } else if (varname == COMPLETE_MESSAGE) { + VLOG(3) << "sync: recv complete message"; + rpc_server_->Complete(); } else { - VLOG(3) << "sync: received var_name: " << varname; - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - - if (invar == nullptr) { - LOG(ERROR) << "sync: Can not find server side var: " << varname; - PADDLE_THROW("sync: Can not find server side var"); - return false; - } - if (invar->IsType()) { - std::unique_lock lock(mutex_sparse_vars_); - sparse_vars_.push_back(invar); + // Async + if (!sync_mode_) { + VLOG(3) << "async process var: " << varname; + rpc_server_->Profiler().OneStep(); + try { + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + } catch (std::exception& e) { + LOG(ERROR) << "async: run sub program error " << e.what(); + return false; + } + return true; + } else { // sync + rpc_server_->WaitCond(kRequestSend); + VLOG(3) << "sync: processing received var: " << varname; + + if (invar == nullptr) { + LOG(FATAL) << "sync: Can not find server side var: " << varname; + return false; + } } } return true; } -void RequestSendHandler::ResetSparseVarRecorder() { - std::unique_lock lock(mutex_sparse_vars_); - for (auto* var : sparse_vars_) { - var->GetMutable()->mutable_rows()->clear(); - } - sparse_vars_.clear(); -} - bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, @@ -95,14 +82,12 @@ bool RequestGetHandler::Handle(const std::string& varname, if (varname == FETCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv fetch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else if (varname == END_PASS_MESSAGE) { - rpc_server_->EndPass(); } else { rpc_server_->WaitCond(kRequestGet); *outvar = scope_->FindVar(varname); } } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) { + if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { *outvar = scope_->FindVar(varname); } } @@ -133,12 +118,13 @@ bool RequestCheckpointHandler::Handle(const 
std::string& varname, checkpoint_notify_id != -1, "when checkpoint_notify_id = -1, there should be no RPC invoke."); - auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + // TODO(tangwei12): find out why scope will be error. + auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); lt_var->clear(); lt_var->append(out_var_name); VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " << out_var_name; - executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope); + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_); return true; } diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 87185500f2ffc3a8578eea339cc7a1e2b0e46631..8be5b21bb89a580f4091de19186fd2d7e5802478 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler { bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const std::string& out_var_name = "") override; - void ResetSparseVarRecorder(); - - private: - std::mutex mutex_sparse_vars_; - std::vector sparse_vars_; }; class RequestGetHandler final : public RequestHandler { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 6479d3a97bafba37b74a1d1c04852a6e60e01be8..3539ee5e459d6dfe0b6510806464bcc6817910bb 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -14,12 +14,14 @@ #pragma once +#include // NOLINT #include #include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); @@ -31,48 +33,43 @@ class RPCClient { public: RPCClient() {} virtual ~RPCClient() {} - virtual bool AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendBeginPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendEndPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - // BeginePass/EndPass tells all the pserver that start/end a pass, so that - // the pserver can increase/reduce it's barrier count, and continue to train + virtual VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const 
framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncPrefetchVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + // Complete tells all the pserver instances that finishe the training, + // the pserver can reduce it's barrier count, and continue to train // with other trainers. - virtual void SendBeginPass() = 0; - virtual void SendEndPass() = 0; + virtual void SendComplete() = 0; - virtual void Wait() = 0; + virtual bool Wait() = 0; template static RPCClient* GetInstance() { diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index d49ee34eeaf4e80f6fd4f8cdc548cc2b938d0f2a..084480ae48b8b9267ade1a840f6a70519cb28e48 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -18,11 +18,44 @@ #include #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_int32(rpc_server_profile_period, 0, + "the period of listen_and_serv to do profile"); +DEFINE_string(rpc_server_profile_path, "/dev/null", + "the profile log file path"); namespace paddle { namespace operators { namespace distributed { +RPCServerProfiler::RPCServerProfiler(int profile_period, + const std::string& profile_log_path) + : profile_period_(profile_period), profile_log_path_(profile_log_path) { + step_ = 0; +} + +void RPCServerProfiler::OneStep() { + PADDLE_ENFORCE_LE(step_, profile_period_, + "step_ should not be larger then " + "profile_period_"); + if (profile_period_ <= 0) { + return; + } + + if (step_ == 0) { + auto pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + } + if (step_ == profile_period_) { + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal, + profile_log_path_); + step_ = 0; + } else { + step_++; + } +} + void RPCServer::ShutDown() { LOG(INFO) << "RPCServer ShutDown "; ShutDownImpl(); @@ -64,21 +97,12 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { } } -void RPCServer::BeginPass() { - VLOG(4) << "RPCServer begin increase pass barrier"; - { - std::unique_lock lock(mutex_); - client_num_++; - VLOG(4) << "increase client_num to: " << client_num_; - } - barrier_cond_.notify_all(); -} - -void RPCServer::EndPass() { - VLOG(4) << "RPCServer begin increase pass barrier"; +void RPCServer::Complete() { { std::unique_lock lock(mutex_); client_num_--; + need_reset_all_vars_ = true; + VLOG(4) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; @@ -87,12 
+111,23 @@ void RPCServer::EndPass() { barrier_cond_.notify_all(); } +bool RPCServer::NeedResetAllVars() { + std::unique_lock lock(mutex_); + return need_reset_all_vars_; +} + +int RPCServer::GetClientNum() { + std::unique_lock lock(mutex_); + return client_num_; +} + void RPCServer::ResetBarrierCounter() { VLOG(3) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); for (auto& t : barrier_counter_) { t.second = 0; } + need_reset_all_vars_ = false; } void RPCServer::RegisterRPC(const std::string& rpc_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 833991c8aa6e7cfd10f2aa52f9218be7ff8ccebf..d88e8c640ffb5ea44e88318cc973c9a783862435 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -19,20 +19,38 @@ #include // NOLINT #include #include + #include "paddle/fluid/operators/distributed/request_handler.h" +DECLARE_int32(rpc_server_profile_period); +DECLARE_string(rpc_server_profile_path); + namespace paddle { namespace operators { namespace distributed { +class RPCServerProfiler { + public: + RPCServerProfiler(int profile_period, const std::string& profile_log_path); + void OneStep(); + + private: + const int profile_period_; + std::string profile_log_path_; + int step_; +}; + class RPCServer { public: explicit RPCServer(const std::string& address, int client_num) : cur_cond_(0), + profiler_(FLAGS_rpc_server_profile_period, + FLAGS_rpc_server_profile_path), bind_address_(address), exit_flag_(false), selected_port_(0), - client_num_(client_num) {} + client_num_(client_num), + need_reset_all_vars_(false) {} virtual ~RPCServer() {} virtual void StartServer() = 0; @@ -44,7 +62,7 @@ class RPCServer { int GetSelectedPort() const { return selected_port_; } - int GetClientNum() const; + int GetClientNum(); void SavePort() const; @@ -64,10 +82,12 @@ class RPCServer { void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); - void BeginPass(); - void EndPass(); + void Complete(); void ResetBarrierCounter(); + RPCServerProfiler& Profiler() { return profiler_; } + + bool NeedResetAllVars(); protected: virtual void ShutDownImpl() = 0; @@ -80,12 +100,14 @@ class RPCServer { std::unordered_map rpc_cond_map_; std::atomic cur_cond_; std::condition_variable rpc_cond_; + RPCServerProfiler profiler_; protected: std::string bind_address_; std::atomic exit_flag_; int selected_port_; int client_num_; + bool need_reset_all_vars_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index a0693cffabcc561b0adfafc2c49027a890dd5efc..d6176e1443d2a441af7878e5efe99796d486bb7a 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -30,7 +30,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::operators::distributed; -USE_OP(lookup_table); +USE_NO_KERNEL_OP(lookup_sparse_table); std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; @@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::VariableNameMap output({{"Output", {"out"}}}); auto op = block->AppendOp(); - op->SetType("lookup_table"); + op->SetType("lookup_sparse_table"); 
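The RPCServerProfiler added above (rpc_server.h/.cc) drives profiling from the request loop: every async send calls Profiler().OneStep(), which enables the CPU profiler on step 0 and dumps it after rpc_server_profile_period steps; the default period of 0 disables it. A condensed, free-standing sketch of that pattern, assuming the paddle::platform profiler API used above and <string>:

// Editorial sketch of the periodic-profiling pattern; not part of the patch.
void OneStepSketch(int* step, int period, const std::string& log_path) {
  if (period <= 0) return;  // FLAGS_rpc_server_profile_period defaults to 0
  if (*step == 0) {
    paddle::platform::EnableProfiler(paddle::platform::ProfilerState::kCPU);
  }
  if (*step == period) {
    paddle::platform::DisableProfiler(
        paddle::platform::EventSortingKey::kTotal, log_path);
    *step = 0;  // start a fresh profiling window
  } else {
    ++(*step);
  }
}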
op->SetInput("W", {"w"}); op->SetInput("Ids", {"ids"}); op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("out"); - out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetShape({10, 10}); return block; @@ -59,30 +59,28 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { w_var->GetMutable(); auto out_var = scope->Var("out"); - out_var->GetMutable(); + out_var->GetMutable(); auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); + ids_var->GetMutable(); } void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - auto rows = ids_var->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); - ids_var->mutable_value()->Resize({rows_numel, 1}); - ids_var->mutable_value()->mutable_data(*place); + auto ids_var = scope->Var("ids")->GetMutable(); + int64_t* ids_ptr = + ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); + for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; } void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); auto w = scope->Var("w")->GetMutable(); - auto rows = w->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); + for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); auto ptr = w_value->mutable_data(*place); @@ -91,7 +89,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, } } -void StartServer() { +void StartServer(const std::string& rpc_name) { framework::ProgramDesc program; framework::Scope scope; platform::CPUPlace place; @@ -107,14 +105,14 @@ void StartServer() { std::shared_ptr> prefetch_var_name_to_prepared; prefetch_var_name_to_prepared[in_var_name] = prepared[0]; + g_req_handler->SetProgram(&program); g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetDevCtx(&ctx); g_req_handler->SetScope(&scope); g_req_handler->SetExecutor(&exe); - g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, - g_req_handler.get()); + g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( @@ -129,7 +127,7 @@ TEST(PREFETCH, CPU) { distributed::RPCClient* client = distributed::RPCClient::GetInstance(); - std::thread server_thread(StartServer); + std::thread server_thread(StartServer, distributed::kRequestPrefetch); g_rpc_service->WaitServerReady(); int port = g_rpc_service->GetSelectedPort(); @@ -148,11 +146,11 @@ TEST(PREFETCH, CPU) { client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); client->Wait(); auto var = scope.Var(out_var_name); - auto value = var->GetMutable()->value(); - auto ptr = value.mutable_data(place); + auto value = var->GetMutable(); + auto ptr = value->mutable_data(place); for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); + EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast(i * 2)); } } @@ -162,3 +160,24 @@ TEST(PREFETCH, CPU) { g_rpc_service.reset(nullptr); g_req_handler.reset(nullptr); } + +TEST(COMPLETE, CPU) { + g_req_handler.reset(new distributed::RequestSendHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); + distributed::RPCClient* client = + 
distributed::RPCClient::GetInstance(); + PADDLE_ENFORCE(client != nullptr); + std::thread server_thread(StartServer, distributed::kRequestSend); + g_rpc_service->WaitServerReady(); + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + client->AsyncSendComplete(ep); + client->Wait(); + + EXPECT_EQ(g_rpc_service->GetClientNum(), 1); + + g_rpc_service->ShutDown(); + server_thread.join(); + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); +} diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto.in similarity index 97% rename from paddle/fluid/operators/distributed/send_recv.proto rename to paddle/fluid/operators/distributed/send_recv.proto.in index e0902320cff003797b12ed0204f7f99c44554b62..8b0a09abe1d05dda10eda0030eb91cb9ca40683e 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -1,3 +1,4 @@ + /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,7 +15,7 @@ limitations under the License. */ syntax = "proto3"; package sendrecv; -// option cc_generic_services = true; +option cc_generic_services = @cc_generic_services@; service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 98129d9f1014c39347e3409533f2bc10092611d2..6a3f8fd544bc5d669b725765a863b42ec069a7b6 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -12,21 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - #ifdef PADDLE_WITH_CUDA #include #endif #include #include // NOLINT -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -34,6 +28,13 @@ namespace distributed { using VarMsg = sendrecv::VariableMessage; +#ifdef PADDLE_WITH_CUDA +void* GetVarPayLoad(const std::string varname, int64_t size) { + platform::CUDAPinnedPlace cuda_pinned; + return memory::Alloc(cuda_pinned, size); +} +#endif + void GetTensorPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request, void** payload, size_t* payload_size) { @@ -58,15 +59,17 @@ void GetTensorPayload(framework::Variable* var, if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); - platform::CUDAPinnedPlace cuda_pinned; + // platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx = static_cast(ctx); auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - *payload = memory::Alloc(cuda_pinned, copy_size); + *payload = GetVarPayLoad(request->varname(), copy_size); + platform::CUDAPinnedPlace cuda_pinned; memory::Copy(cuda_pinned, *payload, boost::get(tensor.place()), reinterpret_cast(tensor.data()), copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); #endif } else { @@ -91,10 +94,11 @@ void GetSelectedRowsPayload(framework::Variable* var, auto* tensor = slr->mutable_value(); if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA - platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx = static_cast(ctx); auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type()); - *payload = memory::Alloc(cuda_pinned, copy_size); + *payload = GetVarPayLoad(request->varname(), copy_size); + + platform::CUDAPinnedPlace cuda_pinned; memory::Copy(cuda_pinned, *payload, boost::get(tensor->place()), reinterpret_cast(tensor->data()), copy_size, @@ -107,126 +111,6 @@ void GetSelectedRowsPayload(framework::Variable* var, *payload_size = tensor->numel() * framework::SizeOfType(tensor->type()); } -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_name) { - // Default DestroyCallback does nothing, When using GPU - // the CPU buffer need to be freed. - DestroyCallback destroy_callback = [](void* backing) {}; - VarMsg request; - void* payload = nullptr; - size_t payload_size; - - request.set_varname(name); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
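One detail worth spelling out from GetTensorPayload/GetSelectedRowsPayload above, since the diff text has lost the template arguments: GPU tensors are staged into CUDA pinned host memory and the device stream is synchronized before serialization. A condensed sketch with the likely template arguments restored, assuming the headers and namespace of sendrecvop_utils.cc:

#ifdef PADDLE_WITH_CUDA
// Editorial sketch; the returned buffer is later freed with
// memory::Free(cuda_pinned, payload) by the serializer's destroy callback.
void* StagePayloadOnPinnedHost(const framework::Tensor& tensor,
                               const platform::CUDADeviceContext& gpu_ctx) {
  platform::CUDAPinnedPlace cuda_pinned;
  size_t copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
  void* payload = memory::Alloc(cuda_pinned, copy_size);
  memory::Copy(cuda_pinned, payload,
               boost::get<platform::CUDAPlace>(tensor.place()),
               tensor.data<void>(), copy_size, gpu_ctx.stream());
  gpu_ctx.Wait();  // the copy runs asynchronously on the device stream
  return payload;
}
#endif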
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - GetTensorPayload(var, ctx, &request, &payload, &payload_size); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); -#ifdef PADDLE_WITH_CUDA - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW("Serialize does not support type: %s", - typeid(var->Type()).name()); - } - - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - // GPU data is copied to CPU buffer when sending, - // free the buffer when possible. - destroy_callback = [](void* backing) { - platform::CUDAPinnedPlace cuda_pinned; - memory::Free(cuda_pinned, backing); - }; -#endif - } - - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. -#ifdef PADDLE_WITH_CUDA - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, - static_cast(payload)), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - - ::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var) { - operators::distributed::VariableResponse resp(scope, &ctx); - PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); - *var = resp.GetVar(); -} - } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h 
b/paddle/fluid/operators/distributed/sendrecvop_utils.h index fe25e73fa608727ba0bb912a82776b330ec8d83a..4d08d3c77afa3c1f2b4d7602f7199558bb5a79c0 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -25,24 +25,21 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { namespace distributed { -typedef void (*DestroyCallback)(void*); +using VarMsg = sendrecv::VariableMessage; -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string()); +void GetTensorPayload(framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + void** payload, size_t* payload_size); -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var); +void GetSelectedRowsPayload(framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + void** payload, size_t* payload_size); inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { switch (type) { diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0fcaf886475c5e03d959ffd6af22b2123526b9f --- /dev/null +++ b/paddle/fluid/operators/distributed/varhandle_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +using paddle::operators::distributed::VarHandlePtr; +using paddle::operators::distributed::VarHandle; + +void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); } + +void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); } + +TEST(VarHandle, Run) { + std::vector a; + for (int i = 0; i < 12; i++) { + VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr)); + a.push_back(s); + } + + std::vector> t; + for (int i = 0; i < 6; i++) { + t.emplace_back(new std::thread(WaitFalse, a[i])); + } + + for (int i = 0; i < 6; i++) { + a[i]->Finish(false); + t[i]->join(); + } + + for (int i = 6; i < 12; i++) { + t.emplace_back(new std::thread(WaitTrue, a[i])); + } + + for (int i = 6; i < 12; i++) { + a[i]->Finish(true); + t[i]->join(); + } +} diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 45832c60bf9172497afabac927ba39a7cbfb9a52..1617cc1b95216b118cf2c2122dbe8b6c106554c3 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,50 +13,20 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/variable_response.h" - -#include -#include #include -#ifdef PADDLE_WITH_CUDA -#include -#endif -#include "paddle/fluid/platform/profiler.h" - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { namespace operators { namespace distributed { -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -bool ReadRaw(::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& dev_ctx, platform::Place place, - void* dest, int size) { +bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& dev_ctx, + platform::Place place, void* dest, + int64_t size) { const void* data = NULL; int size_to_write = 0; - int length = size; + int64_t length = size; int total_written = 0; if (platform::is_gpu_place(place)) { @@ -181,6 +151,7 @@ bool VariableResponse::CopySelectRowsData( ::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, int length) { auto* slr = GetVar()->GetMutable(); + slr->mutable_rows()->clear(); slr->mutable_rows()->resize(length / framework::SizeOfType(typeid(int64_t))); // int64 int64_t* rows_data = slr->mutable_rows()->data(); @@ -194,294 +165,54 @@ bool VariableResponse::CopySelectRowsData( return true; } -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - 
int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } +bool VariableResponse::ProcSerializedField( + int tag, ::google::protobuf::io::CodedInputStream* input, + int64_t num_bytes) { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR || + meta_.type() == sendrecv::NCCL_ID) && + meta_.varname() != "", + "meta info should be got first!"); + if (meta_.type() == sendrecv::NCCL_ID) { +#ifdef PADDLE_WITH_CUDA + auto* var = scope_->FindVar(meta_.varname()); + if (var != nullptr) { + ncclUniqueId* id = var->GetMutable(); + if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal, + num_bytes)) { return false; } - default: { return false; } } - } - - return true; -} - -int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -int VariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case 
sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return false; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR || - meta_.type() == sendrecv::NCCL_ID) && - meta_.varname() != "", - "meta info should be got first!"); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (meta_.type() == sendrecv::NCCL_ID) { -#ifdef PADDLE_WITH_CUDA - auto* var = scope_->FindVar(meta_.varname()); - if (var != nullptr) { - ncclUniqueId* id = var->GetMutable(); - if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal, - num_bytes)) { - return tag; - } - } - break; + return true; #else - PADDLE_THROW("Not compiled with CUDA!"); + PADDLE_THROW("Not compiled with CUDA!"); + return false; #endif - } - - framework::DDim dims = GetDims(meta_.dims()); - if (meta_.type() == sendrecv::LOD_TENSOR) { - PADDLE_ENFORCE(meta_.lod_size() >= 0, - "lod info should be got first!"); - if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) { - return tag; - } - break; - } - - if (meta_.type() == sendrecv::SELECTED_ROWS) { - if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) { - return tag; - } - break; - } - - return tag; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - "meta info should be got first!"); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } + } - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } + VLOG(7) << "ProcSerializedField:" << meta_.varname() + << ", type:" << meta_.type() << std::endl; + framework::DDim dims = GetDims(meta_.dims()); + if (meta_.type() == sendrecv::LOD_TENSOR) { + PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!"); + if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) { + return false; + } - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } + return true; + } - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - 
!platform::IsProfileEnabled()) { - platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - // TODO(panyx0718): Should we allow to customize file dir. - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("/tmp/profile_ps_%lld", listener_id)); - } - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } + if (meta_.type() == sendrecv::SELECTED_ROWS) { + if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) { + return false; } + return true; } - return 0; + PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type()); + + return false; } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 1db4a0a522654ff2497b8bd9ee1381b5ab64067a..6aec52ca00f59a42ecca01da8df1680ce4eda432 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -22,18 +22,35 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { namespace distributed { +// Source provides a way for a particular RPC implementation to provide +// received data to ParseFrom. +class Source { + public: + virtual ~Source() {} + + // Return the stream that contains the data to be parsed. + // Note that this method might be invoked more than once if + // ParseFrom needs to fall back to a more expensive parsing method. + // Every call must return a stream pointing at the beginning of + // the serialized RecvTensorResponse. + // + // Note that a subsequent call to contents() invalidates previous + // results of contents(). + // + // Ownership of the returned stream is retained by the Source and + // should not be deleted by the caller. + virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; +}; + class VariableResponse { public: VariableResponse(const framework::Scope* scope, @@ -51,22 +68,19 @@ class VariableResponse { } } - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. - int Parse(Source* source); + int Parse(Source* source, const sendrecv::VariableMessage& meta) { + meta_ = meta; + return Parse(source); + } // return: // 0:ok. // -1: unkown error. // other: number of error field. 
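// A minimal sketch of a concrete Source, assuming only the interface declared
// above; the BufferSource name and its in-memory buffer are hypothetical and
// are not part of this patch. It honours the documented contract that every
// contents() call returns a stream positioned at the beginning of the
// serialized message, so Parse can restart if it falls back to a slower
// parsing path.
#include <memory>
#include <string>
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

class BufferSource : public Source {
 public:
  explicit BufferSource(std::string buf) : buf_(std::move(buf)) {}

  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
    // Recreate the stream so a repeated call re-reads from the start.
    stream_.reset(new ::google::protobuf::io::ArrayInputStream(
        buf_.data(), static_cast<int>(buf_.size())));
    return stream_.get();
  }

 private:
  std::string buf_;
  std::unique_ptr<::google::protobuf::io::ArrayInputStream> stream_;
};
// A transport would hand such a Source to VariableResponse::Parse and treat a
// non-zero return value as the failing field number (or -1 for an unknown
// error), per the comment above.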
- int Parse(const ::grpc::ByteBuffer& byte_buffer); - - const framework::Scope& GetLocalScope() const { return *local_scope_; } - - framework::Scope* GetMutableLocalScope() const { return local_scope_; } + virtual int Parse(Source* source) = 0; + inline const framework::Scope& GetLocalScope() const { return *local_scope_; } + inline framework::Scope* GetMutableLocalScope() const { return local_scope_; } inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } @@ -78,7 +92,11 @@ class VariableResponse { return scope_->FindVar(meta_.varname()); } - private: + protected: + bool ReadRaw(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& dev_ctx, platform::Place place, + void* dest, int64_t size); + bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, const framework::DDim& dims, int length); @@ -90,12 +108,16 @@ class VariableResponse { const platform::DeviceContext& ctx, const framework::DDim& dims, int length); - private: + bool ProcSerializedField(int tag, + ::google::protobuf::io::CodedInputStream* input, + int64_t num_bytes); + + protected: const framework::Scope* scope_; const platform::DeviceContext* dev_ctx_; bool create_scope_ = false; framework::Scope* local_scope_ = nullptr; - // only Skeleton + sendrecv::VariableMessage meta_; }; diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc index 3f612256840825a75f49944ab97ff957d572a863..9ad82aec8182d6ba06b67391d71317a3d0df1833 100644 --- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); - auto y_dims = y->dims(); + auto y_dims_untrimed = y->dims(); auto z_dims = z->dims(); // Execute default elementwise_add operator when // broadcast operations need to performed. - if (x_dims != y_dims) { + if (x_dims != y_dims_untrimed) { auto sum_func = [](T a, T b) -> T { return a + b; }; TransformFunctor { ctx.template device_context(), sum_func); - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post; @@ -85,10 +85,10 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { "Wrong layout/format set for X tensor"); PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN && y->format() != memory::format::format_undef, - "Wrong layout/format set for X tensor"); + "Wrong layout/format set for Y tensor"); std::vector src_x_tz = framework::vectorize2int(x_dims); - std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims_untrimed); std::vector dst_tz = framework::vectorize2int(z_dims); std::vector srcs_pd; @@ -137,41 +137,45 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { }; template -class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { +class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); + // skip out, x, y, + // dout length is larger or equal than dx, dy. + auto* out = dout; + auto *x = dout, *y = dout; auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { in->set_layout(DataLayout::kMKLDNN); in->set_format(out->format()); }; - if (x->dims() == y->dims()) { - auto blas = math::GetBlas(ctx); - if (dx) { - blas.VCOPY(dout->numel(), dout->data(), - dx->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dx, dout); - } - - if (dy) { - blas.VCOPY(dout->numel(), dout->data(), - dy->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dy, dout); + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { + if (dx->dims() == dy->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } } } else { // Execute default kernel when broadcast is needed - ElemwiseGradCompute, IdentityGrad>( + ElemwiseExplicitGradCompute, IdentityGrad>( ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); } diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc index d2c20537136fc3ac9d1bece24a2238f26215c922..3c97ac995c649ecd0d196a584240e1e7ac04f08e 100644 --- a/paddle/fluid/operators/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", + "X"); REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index baf04c30b17cb333fc8a6544afd6c479442f835b..c60cb1f92e99329d52f6ed39dccde406a5f83563 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" @@ -95,9 +96,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); - ElemwiseGradCompute, IdentityGrad>( - ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), - IdentityGrad()); + ElemwiseExplicitGradCompute, + IdentityGrad>(ctx, *x, *y, *out, *dout, axis, + dx, dy, IdentityGrad(), + IdentityGrad()); } template @@ -135,19 +137,22 @@ elementwise_add_grad(const framework::ExecutionContext& ctx, } template -class ElementwiseAddGradKernel : public framework::OpKernel { +class ElementwiseAddGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; - if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) { + if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr && + dy != nullptr && (dx->dims() == dy->dims())) { elementwise_add_grad(ctx, x, y, out, dout, dx, dy); } else { default_elementwise_add_grad(ctx, x, y, out, dout, dx, diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc index 824b1221e5a77c8799dc34820b7f0db180c2439e..84c8a65e5f859d276ae6d5f1a3f25c9d713a7a61 100644 --- a/paddle/fluid/operators/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; + REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); + REGISTER_OP_CPU_KERNEL( elementwise_div, ops::ElementwiseDivKernel, diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h index 95649ac46e6bd41b9e1a865794cdec3ae1e6e247..41a7950bf0c598507c0fda48c6a43f2fd38c41d2 100644 --- a/paddle/fluid/operators/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" - namespace paddle { namespace operators { @@ -53,9 +53,10 @@ struct DivGradDY { }; template -class ElementwiseDivGradKernel : public framework::OpKernel { +class ElementwiseDivGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h index 527a18ee3ba88a158a13266a7fbcdafe59ec69d9..bfb5c931958b4ca890ea720af42dad91d5625abb 100644 --- a/paddle/fluid/operators/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" namespace paddle { @@ -55,9 +56,10 @@ struct MaxGradDy { }; template -class ElementwiseMaxGradKernel : public framework::OpKernel { +class ElementwiseMaxGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h index d4e5831463f3e54c72789b6876ea696cf1b4ef4b..db035ffb52e619b337c8190af4ed0e155aaac48d 100644 --- a/paddle/fluid/operators/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" - namespace paddle { namespace operators { @@ -55,9 +55,10 @@ struct MinGradDy { }; template -class ElementwiseMinGradKernel : public framework::OpKernel { +class ElementwiseMinGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index dc73cb6f23614504640283af01981d3f69e89126..4437da4d95f97b5cbbca1650badf9710c26b4380 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -23,6 +25,37 @@ struct MulFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } }; +template +void default_elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, framework::Tensor* z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); +} + +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + auto blas = math::GetBlas(ctx); + blas.VMUL(x->numel(), x->data(), y->data(), + z->mutable_data(ctx.GetPlace())); +} + +template +typename std::enable_if< + !std::is_floating_point::value || + !std::is_same::value>::type +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + default_elementwise_mul(ctx, x, y, z); +} + template class ElementwiseMulKernel : public framework::OpKernel { public: @@ -33,9 +66,11 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - MulFunctor(), z); + if (x->numel() == y->numel()) { + elementwise_mul(ctx, x, y, z); + } else { + default_elementwise_mul(ctx, x, y, z); + } } }; @@ -50,9 +85,10 @@ struct MulGradDY { }; template -class ElementwiseMulGradKernel : public framework::OpKernel { +class ElementwiseMulGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index bb88970e42c194d9437609b62435f1a89e2b446b..a79b900b9801e6b80e4433a9acdd4dab6c34859d 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op.").Reuse("X"); + // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save + // memory.").AsIntermediate(); + AddOutput("Out", "The output of elementwise op."); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") @@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$. 
)DOC", GetName(), GetEquation())); + SetReuse(); } protected: virtual std::string GetName() const = 0; virtual std::string GetEquation() const = 0; + virtual void SetReuse() {} }; class ElementwiseOpGrad : public framework::OperatorWithKernel { @@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { @@ -175,9 +179,72 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; + +// For Add, Sub op, the X, Out is not needed. +class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { + public: + using operators::ElementwiseOpGrad::ElementwiseOpGrad; + using operators::ElementwiseOpGrad::GetExpectedKernelType; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, out_dims); + } + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +template +class ElemwiseGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = + context.Output(framework::GradVarName("X")); + if (dx != nullptr) { + auto& dout = + *context.Input(framework::GradVarName("Out")); + dx->set_lod(dout.lod()); + } + } +}; + } // namespace operators } // namespace paddle +/* +*/ + +#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ + class kernel_type##GradMaker \ + : public paddle::framework::SingleGradOpDescMaker { \ + public: \ + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \ + \ + protected: \ + std::unique_ptr Apply() const override { \ + auto* op = new paddle::framework::OpDesc(); \ + op->SetType(#kernel_type "_grad"); \ + op->SetInput("Y", Input("Y")); \ + op->SetInput(::paddle::framework::GradVarName("Out"), \ + OutputGrad("Out")); \ + op->SetAttrMap(Attrs()); \ + op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ + op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y")); \ + return std::unique_ptr<::paddle::framework::OpDesc>(op); \ + } \ + } + #define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \ class __ElemwiseOp##op_type##Maker__ \ : public ::paddle::operators::ElementwiseOpMaker { \ @@ -190,3 +257,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ::paddle::operators::ElementwiseOpInferVarType, \ ::paddle::framework::DefaultGradOpDescMaker); \ REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) + +#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) 
\ + class __ElemwiseOp##op_type##Maker__ \ + : public ::paddle::operators::ElementwiseOpMaker { \ + protected: \ + virtual std::string GetName() const { return op_name; } \ + virtual std::string GetEquation() const { return equation; } \ + virtual void SetReuse() { Reuse(__VA_ARGS__); } \ + }; \ + REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ + __ElemwiseOp##op_type##Maker__, \ + ::paddle::operators::ElementwiseOpInferVarType, \ + op_type##GradMaker); \ + REGISTER_OPERATOR(op_type##_grad, \ + ::paddle::operators::ElementwiseOpExplicitGrad) diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 8b052611f80ddf874ca48c1c58e13346528a834e..b1a399c22c2b9ed7464a1b1764478803d4416d94 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -44,9 +48,9 @@ namespace operators { * pre=2*3, n=4*5, post=1 * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) */ -inline void get_mid_dims(const framework::DDim& x_dims, - const framework::DDim& y_dims, const int axis, - int* pre, int* n, int* post) { +inline void get_mid_dims(const framework::DDim &x_dims, + const framework::DDim &y_dims, const int axis, + int *pre, int *n, int *post) { *pre = 1; *n = 1; *post = 1; @@ -65,30 +69,41 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -inline void trim_trailing_singular_dims(framework::DDim* dims) { +inline framework::DDim trim_trailing_singular_dims( + const framework::DDim &dims) { // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims->size(); + auto actual_dims_size = dims.size(); for (; actual_dims_size != 0; --actual_dims_size) { - if ((*dims)[actual_dims_size - 1] != 1) break; + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; } - if (actual_dims_size != dims->size()) { - auto actual_dims = framework::vectorize(*dims); - actual_dims.resize(actual_dims_size); - *dims = framework::make_ddim(actual_dims); + if (trim_dims.size() == 0) { + return framework::DDim(framework::make_dim()); } + framework::DDim actual_dims = framework::make_ddim(trim_dims); + return actual_dims; } template class RowwiseTransformIterator; + template class MidWiseTransformIterator; +// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17 template -class RowwiseTransformIterator { +class RowwiseTransformIterator + : public std::iterator { public: - RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} - RowwiseTransformIterator& operator++() { + RowwiseTransformIterator &operator++() { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -96,31 +111,33 @@ class RowwiseTransformIterator { return *this; } - bool operator==(const RowwiseTransformIterator& - rhs) const { + bool operator==(const RowwiseTransformIterator + &rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=(const RowwiseTransformIterator& - rhs) const { + bool operator!=(const RowwiseTransformIterator + &rhs) const { return (ptr_ + i_) != 
&(*rhs); } - const T& operator*() { return ptr_[i_]; } + const T &operator*() { return ptr_[i_]; } private: - const T* ptr_; + const T *ptr_; int i_; int64_t n_; }; template -class MidWiseTransformIterator { +class MidWiseTransformIterator + : public std::iterator { public: - MidWiseTransformIterator(const T* ptr, int n, int post) + MidWiseTransformIterator(const T *ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - MidWiseTransformIterator& operator++() { + MidWiseTransformIterator &operator++() { ++j_; if (UNLIKELY(j_ == post_)) { ++i_; @@ -132,20 +149,20 @@ class MidWiseTransformIterator { return *this; } - bool operator==(const MidWiseTransformIterator& - rhs) const { + bool operator==(const MidWiseTransformIterator + &rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=(const MidWiseTransformIterator& - rhs) const { + bool operator!=(const MidWiseTransformIterator + &rhs) const { return (ptr_ + i_) != &(*rhs); } - const T& operator*() { return ptr_[i_]; } + const T &operator*() { return ptr_[i_]; } private: - const T* ptr_; + const T *ptr_; int64_t i_; int64_t j_; int64_t n_; @@ -156,18 +173,18 @@ class MidWiseTransformIterator { template class RowwiseTransformIterator : public thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> { + RowwiseTransformIterator, const T *> { public: typedef thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> + RowwiseTransformIterator, const T *> super_t; - HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + HOSTDEVICE RowwiseTransformIterator(const T *x, int n) : super_t(x), begin_(x), n_(n) {} friend class thrust::iterator_core_access; private: unsigned int n_; - const T* begin_; + const T *begin_; HOSTDEVICE typename super_t::reference dereference() const { return *(begin_ + (this->base() - begin_) % n_); } @@ -176,19 +193,19 @@ class RowwiseTransformIterator template class MidWiseTransformIterator : public thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> { + MidWiseTransformIterator, const T *> { public: typedef thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> + MidWiseTransformIterator, const T *> super_t; - HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + HOSTDEVICE MidWiseTransformIterator(const T *x, int n, int post) : super_t(x), begin_(x), n_(n), post_(post) {} friend class thrust::iterator_core_access; private: unsigned int post_; unsigned int n_; - const T* begin_; + const T *begin_; HOSTDEVICE typename super_t::reference dereference() const { return *(begin_ + (((this->base() - begin_) / post_) % n_)); } @@ -199,8 +216,8 @@ template class TransformFunctor { public: - TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z, const DeviceContext& ctx, Functor func) + TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z, const DeviceContext &ctx, Functor func) : x_(x->data()), y_(y->data()), z_(z->mutable_data(ctx.GetPlace())), @@ -226,20 +243,20 @@ class TransformFunctor { } private: - const T* x_; - const T* y_; - OutType* z_; + const T *x_; + const T *y_; + OutType *z_; int64_t nx_; - const DeviceContext& ctx_; + const DeviceContext &ctx_; Functor func_; }; #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ template \ - inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ - framework::Tensor* z, \ - const framework::ExecutionContext& ctx) { \ + inline void Run(const framework::Tensor *x, const framework::Tensor 
*y, \ + framework::Tensor *z, \ + const framework::ExecutionContext &ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ @@ -248,9 +265,9 @@ class TransformFunctor { eigen_op(x_e, y_e); \ } \ template \ - inline void RunBroadCast(const framework::Tensor* x, \ - const framework::Tensor* y, framework::Tensor* z, \ - const framework::ExecutionContext& ctx, int pre, \ + inline void RunBroadCast(const framework::Tensor *x, \ + const framework::Tensor *y, framework::Tensor *z, \ + const framework::ExecutionContext &ctx, int pre, \ int n) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ @@ -263,10 +280,10 @@ class TransformFunctor { eigen_op(x_e, y_bcast); \ } \ template \ - inline void RunBroadCast2(const framework::Tensor* x, \ - const framework::Tensor* y, \ - framework::Tensor* z, \ - const framework::ExecutionContext& ctx, int pre, \ + inline void RunBroadCast2(const framework::Tensor *x, \ + const framework::Tensor *y, \ + framework::Tensor *z, \ + const framework::ExecutionContext &ctx, int pre, \ int n, int post) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ @@ -281,23 +298,27 @@ class TransformFunctor { } #define EIGEN_ADD(x, y) ((x) + (y)) + EIGEN_FUNCTOR(Add, EIGEN_ADD); #define EIGEN_SUB(x, y) ((x) - (y)) + EIGEN_FUNCTOR(Sub, EIGEN_SUB); #define EIGEN_MUL(x, y) ((x) * (y)) + EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) + EIGEN_FUNCTOR(Div, EIGEN_DIV); template struct ElemwiseGradNoBroadcast { - const T* x_; - const T* y_; - const T* out_; - const T* dout_; + const T *x_; + const T *y_; + const T *out_; + const T *dout_; HOSTDEVICE void operator()(size_t i) { if (dx_ != nullptr) { @@ -310,14 +331,14 @@ struct ElemwiseGradNoBroadcast { DX_OP dx_op_; DY_OP dy_op_; - T* dx_; - T* dy_; + T *dx_; + T *dy_; }; template -static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, - const T* dout, int h, int w, DX_OP dx_op, - DY_OP dy_op, T* dx, T* dy) { +static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out, + const T *dout, int h, int w, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int x_offset = i * w + j; @@ -339,8 +360,8 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, #ifdef __NVCC__ template static __global__ void ElemwiseGradBroadcast1CUDAKernel( - const T* x, const T* y, const T* out, const T* dout, int h, int w, - DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { + const T *x, const T *y, const T *out, const T *dout, int h, int w, + DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { int j = blockIdx.x; int i = threadIdx.x; int tid = threadIdx.x; @@ -367,10 +388,10 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( } template -static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x, - const T* y, const T* out, const T* dout, +static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, + const T *y, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, - T* dx, T* dy) { + T *dx, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int gird_size = w; ElemwiseGradBroadcast1CUDAKernel<<>>( @@ -380,9 +401,9 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x, #endif template -static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out, - const T* 
dout, int pre, int n, int post, - DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { +static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out, + const T *dout, int pre, int n, int post, + DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < post; ++k) { @@ -407,8 +428,8 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out, #ifdef __NVCC__ template static __global__ void ElemwiseGradBroadcast2CUDAKernel( - const T* x, const T* y, const T* out, const T* dout, int pre, int n, - int post, DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { + const T *x, const T *y, const T *out, const T *dout, int pre, int n, + int post, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { int tid = threadIdx.x; int j = blockIdx.x; @@ -444,10 +465,10 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( } template -static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, - const T* y, const T* out, const T* dout, +static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x, + const T *y, const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, - DY_OP dy_op, T* dx, T* dy) { + DY_OP dy_op, T *dx, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int gird_size = n; ElemwiseGradBroadcast2CUDAKernel<<>>( @@ -457,78 +478,135 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, #endif template -void ElemwiseGradCompute(const framework::ExecutionContext& ctx, - const framework::Tensor& x, const framework::Tensor& y, - const framework::Tensor& out, - const framework::Tensor& dout, int axis, - framework::Tensor* dx, framework::Tensor* dy, - DX_OP dx_op, DY_OP dy_op) { - if (x.dims() == y.dims()) { - size_t N = static_cast(framework::product(x.dims())); - platform::ForRange for_range( - ctx.template device_context(), N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); - } else { // Y is a scalar - auto x_dim = x.dims(); - auto y_dim = y.dims(); - - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - trim_trailing_singular_dims(&y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - if (post == 1) { - int h = pre; - int w = n; - if (platform::is_gpu_place(ctx.GetPlace())) { +void ElemwiseGradComputeNoBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim, const framework::Tensor &x, + const framework::Tensor &y, const framework::Tensor &out, + const framework::Tensor &dout, int axis, framework::Tensor *dx, + framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + size_t N = static_cast(framework::product(x_dim)); +#if !defined(_WIN32) + platform::ForRange for_range( + ctx.template device_context(), N); +#else + platform::ForRange for_range( + ctx.device_context(), N); +#endif // !_WIN32 + for_range(ElemwiseGradNoBroadcast{ + x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())}); +} + +template +void ElemwiseGradComputeWithBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim_untrimed, const framework::Tensor &x, + const framework::Tensor &y, const framework::Tensor &out, + const framework::Tensor &dout, int axis, framework::Tensor *dx, + framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ - ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); #endif - } else { - ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), h, w, - dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } } else { - if (platform::is_gpu_place(ctx.GetPlace())) { + ElemwiseGradBroadcast1CPU( + x.data(), y.data(), out.data(), dout.data(), h, w, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ - ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), pre, n, post, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); #endif - } else { - ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } + } else { + ElemwiseGradBroadcast2CPU( + x.data(), y.data(), out.data(), dout.data(), pre, n, post, + dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); + } + } +} + +template +void ElemwiseGradCompute(const framework::ExecutionContext &ctx, + const framework::Tensor &x, const framework::Tensor &y, + const framework::Tensor &out, + const framework::Tensor &dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, + DX_OP dx_op, DY_OP dy_op) { + const framework::DDim &x_dim = x.dims(); + const framework::DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { + ElemwiseGradComputeNoBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { // Y is a scalar + ElemwiseGradComputeWithBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } +} + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. +// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, + const framework::Tensor &x, + const framework::Tensor &y, + const framework::Tensor &out, + const framework::Tensor &dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, + DX_OP dx_op, DY_OP dy_op) { + if (dy == nullptr) { + const framework::DDim &dx_dims = dout.dims(); + auto dy_dims = dx_dims; + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { + if (dout.dims() == dy->dims()) { + const framework::DDim &dx_dims = dout.dims(); + const framework::DDim &dy_dims = dy->dims(); + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { // Y is a scalar + auto dx_dims = dout.dims(); + const framework::DDim &dy_dims = dy->dims(); + ElemwiseGradComputeWithBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } } +// Deprecated template -void ElementwiseGradCompute(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, int axis, - framework::Tensor* dx, framework::Tensor* dy) { - auto& place = *ctx.template device_context().eigen_device(); +void ElementwiseGradCompute(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + const framework::Tensor *out, + const framework::Tensor *dout, int axis, + framework::Tensor *dx, framework::Tensor *dy) { + auto &place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -547,7 +625,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, } axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - trim_trailing_singular_dims(&y_dims); + trim_trailing_singular_dims(y_dims); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post; @@ -566,27 +644,27 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, template -void ElementwiseComputeEx(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, int axis, Functor func, - framework::Tensor* z) { + +void ElementwiseComputeEx(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, int axis, Functor func, + framework::Tensor *z) { TransformFunctor functor( x, y, z, ctx.template device_context(), func); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + auto y_dims_untrimed = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), "Rank of first input must >= rank of second input."); - if (x_dims == y_dims) { + if (x_dims == y_dims_untrimed) { functor.Run(); return; } - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; @@ -600,5 +678,823 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, } } +// FusedElemwiseAndAct +// --- forward +template +struct FusedElemwiseAndActNoBroadcast { + HOSTDEVICE void operator()(size_t i) { + T y_val = y_[i]; + T x_val = x_[i]; + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor_.GetIntermediateOut(x_val, y_val); + intermediate_out_[i] = intermeidiate_out; + out_[i] = + compound_functor_.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out_[i] = compound_functor_.GetOut(x_val, y_val); + } + } + + const T *x_; + const T *y_; + CompoundFunctor compound_functor_; + T *out_; + T *intermediate_out_; +}; + +// FusedElemwiseAndActBroadcast1: +// In this case, X and Y can be reshaped to a matrix. +// For example shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) and axis = -1 or 2, +// X can be reshaped to (6, 20) and Y can be reshaped to (1, 20) +template +static void FusedElemwiseAndActBroadcast1CPU(const T *x, const T *y, + CompoundFunctor compound_functor, + int h, int w, T *out, + T *intermediate_out) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int offset = i * w + j; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + } + } +} + +// FusedElemwiseAndActBroadcast2 +// In this case, X and Y can be reshaped to a matrix. 
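// A small self-contained check of the (pre, n, post) factorisation the
// broadcast comments in this file describe, written against plain integer
// shapes instead of framework::DDim; MidDims below is a hypothetical
// stand-in for get_mid_dims, added only for illustration.
#include <cassert>
#include <vector>

static void MidDims(const std::vector<int> &x, const std::vector<int> &y,
                    int axis, int *pre, int *n, int *post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x[i];
  for (size_t i = 0; i < y.size(); ++i) *n *= y[i];
  for (size_t i = axis + y.size(); i < x.size(); ++i) *post *= x[i];
}

int main() {
  int pre, n, post;
  // shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), axis = 2 -> (6, 20, 1)
  MidDims({2, 3, 4, 5}, {4, 5}, 2, &pre, &n, &post);
  assert(pre == 6 && n == 20 && post == 1);
  // shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1 -> (2, 12, 5)
  MidDims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);
  return 0;
}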
+// For example shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4) and axis = 1, +// X can be reshaped to (2, 12, 5) and Y can be reshaped to (1, 12, 1) +// pre = 2, n = 12, post = 5 +template +static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre, + int n, int post, + CompoundFunctor compound_functor, + T *out, T *intermediate_out) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int offset = i * n * post + j * post + k; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = + compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = compound_functor.GetOutUseIntermediateOut( + x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + } + } + } +} + +#ifdef __NVCC__ +template +static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel( + const T *x, const T *y, int h, int w, CompoundFunctor compound_functor, + T *out, T *intermediate_out) { + int j = blockIdx.x; + int i = threadIdx.x; + + while (i < h) { + int offset = i * w + j; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + + i += ELEMWISE_MAX_BLOCK_DIM; + } +} + +template +static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x, + const T *y, + CompoundFunctor compound_functor, + int h, int w, T *out, + T *intermediate_out) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + FusedElemwiseAndActBroadcast1CUDAKernel< + T, CompoundFunctor, BcastY, KeepIntermediateOut, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, h, w, compound_functor, out, intermediate_out); +} + +template +static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel( + const T *x, const T *y, CompoundFunctor compound_functor, int pre, int n, + int post, T *out, T *intermediate_out) { + int tid = threadIdx.x; + int j = blockIdx.x; + + while (true) { + int i = tid / post; + int k = tid % post; + if (i >= pre) break; + + int offset = i * n * post + j * post + k; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? 
x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + + tid += ELEMWISE_MAX_BLOCK_DIM; + } +} + +template +static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x, + const T *y, int pre, int n, + int post, + CompoundFunctor compound_functor, + T *out, T *intermediate_out) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); + int gird_size = n; + + FusedElemwiseAndActBroadcast2CUDAKernel< + T, CompoundFunctor, BcastY, KeepIntermediateOut, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, compound_functor, pre, n, post, out, intermediate_out); +} + +#endif + +template +void FusedElemwiseAndActComputeNoBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::Tensor &x, const framework::Tensor &y, + CompoundFunctor compound_functor, framework::Tensor *out, + framework::Tensor *intermediate_out) { + size_t N = static_cast(framework::product(x_dim)); + + platform::ForRange for_range( + ctx.template device_context(), N); + + for_range( + FusedElemwiseAndActNoBroadcast{ + x.data(), y.data(), compound_functor, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())}); +} + +template +void FusedElemwiseAndActComputeWithBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim_untrimed, const framework::Tensor &x, + const framework::Tensor &y, CompoundFunctor compound_functor, int axis, + framework::Tensor *out, framework::Tensor *intermediate_out) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActBroadcast1CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), compound_functor, h, w, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActBroadcast1CPU( + x.data(), y.data(), compound_functor, h, w, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActBroadcast2CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), pre, n, post, compound_functor, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActBroadcast2CPU( + x.data(), y.data(), pre, n, post, compound_functor, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? 
nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); + } + } +} + +// --- backward +template +struct FusedElemwiseAndActGradNoBroadcast { + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = UseIntermediateOut ? dx_op_(x_[i], y_[i], intermediate_out_[i], + out_[i], dout_[i]) + : dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = UseIntermediateOut ? dy_op_(x_[i], y_[i], intermediate_out_[i], + out_[i], dout_[i]) + : dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + const T *x_; + const T *y_; + const T *intermediate_out_; + const T *out_; + const T *dout_; + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + +template +void FusedElemwiseAndActGradComputeNoBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim, const framework::Tensor *x, + const framework::Tensor *y, const framework::Tensor *intermediate_out, + const framework::Tensor *out, const framework::Tensor *dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + size_t N = static_cast(framework::product(x_dim)); + platform::ForRange for_range( + ctx.template device_context(), N); + for_range( + FusedElemwiseAndActGradNoBroadcast{ + x->data(), y->data(), + intermediate_out ? intermediate_out->data() : nullptr, + out->data(), dout->data(), dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); +} + +template +static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, + const T *intermediate_out, + const T *out, const T *dout, + int h, int w, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + int64_t tmp_out_idx, x_idx, y_idx; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int offset = i * w + j; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + if (i == 0) { + dx[x_idx] = tmp; + } else { + dx[x_idx] += tmp; + } + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + if (i == 0) { + dy[y_idx] = tmp; + } else { + dy[y_idx] += tmp; + } + } else { + dy[y_idx] = tmp; + } + } + } + } +} + +template +static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, + const T *intermediate_out, + const T *out, const T *dout, + int pre, int n, int post, + DX_OP dx_op, DY_OP dy_op, + T *dx, T *dy) { + int64_t tmp_out_idx, x_idx, y_idx; + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int offset = i * n * post + j * post + k; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? 
dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + if (i == 0 && k == 0) { + dx[x_idx] = tmp; + } else { + dx[x_idx] += tmp; + } + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + if (i == 0 && k == 0) { + dy[y_idx] = tmp; + } else { + dy[y_idx] += tmp; + } + } else { + dy[y_idx] = tmp; + } + } + } + } + } +} + +#ifdef __NVCC__ +template +static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( + const T *x, const T *y, const T *intermediate_out, const T *out, + const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { + int j = blockIdx.x; + int i = threadIdx.x; + int tid = threadIdx.x; + T val(0); + int64_t tmp_out_idx, x_idx, y_idx; + + do { + int offset = i * w + j; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + val += tmp; + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + val += tmp; + } else { + dy[y_idx] = tmp; + } + } + + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); + + if (BcastY) { + if (dy) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { + if (dx) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } + } + } +} + +template +static void FusedElemwiseAndActGradBroadcast1CUDA(cudaStream_t stream, + const T *x, const T *y, + const T *intermediate_out, + const T *out, const T *dout, + int h, int w, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + FusedElemwiseAndActGradBroadcast1CUDAKernel< + T, DX_OP, DY_OP, UseIntermediateOut, BcastY, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dx, dy); +} + +template +static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( + const T *x, const T *y, const T *intermediate_out, const T *out, + const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, T *dx, + T *dy) { + int tid = threadIdx.x; + int j = blockIdx.x; + + T val(0); + int ttid = tid; + int64_t tmp_out_idx, x_idx, y_idx; + while (true) { + int i = ttid / post; + int k = ttid % post; + if (i >= pre) break; + + int offset = i * n * post + j * post + k; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? 
dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + val += tmp; + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + val += tmp; + } else { + dy[y_idx] = tmp; + } + } + + ttid += ELEMWISE_MAX_BLOCK_DIM; + } + + if (BcastY) { + if (dy) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { + if (dx) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } + } + } +} + +template +static void FusedElemwiseAndActGradBroadcast2CUDA( + cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, + const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); + int gird_size = n; + FusedElemwiseAndActGradBroadcast2CUDAKernel< + T, DX_OP, DY_OP, UseIntermediateOut, BcastY, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, dx, dy); +} +#endif + +template +void FusedElemwiseAndActGradComputeWithBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim_untrimed, const framework::Tensor *x, + const framework::Tensor *y, const framework::Tensor *intermediate_out, + const framework::Tensor *out, const framework::Tensor *dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActGradBroadcast1CUDA( + ctx.template device_context().stream(), x->data(), + y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActGradBroadcast1CPU( + x->data(), y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActGradBroadcast2CUDA( + ctx.template device_context().stream(), x->data(), + y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), pre, n, post, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActGradBroadcast2CPU( + x->data(), y->data(), + intermediate_out == nullptr ? 
nullptr : intermediate_out->data(), + out->data(), dout->data(), pre, n, post, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } +} + +template +void FusedElemwiseAndActGradComputeEx( + const framework::ExecutionContext &ctx, const framework::Tensor *x, + const framework::Tensor *y, const framework::Tensor *out, + const framework::Tensor *intermediate_out, const framework::Tensor *dout, + int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, + DY_OP dy_op) { + const framework::DDim &x_dim = x->dims(); + const framework::DDim &y_dim = y->dims(); + if (UseIntermediateOut) { + PADDLE_ENFORCE(intermediate_out, "intermediate_out should not be nullptr"); + } + if (x_dim == y_dim) { + FusedElemwiseAndActGradComputeNoBroadcast( + ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy, + dx_op, dy_op); + } else { // Y is a scalar + bool bcast_y = x_dim.size() >= y_dim.size(); + if (x_dim.size() == y_dim.size()) { + for (int i = 0; i < x_dim.size(); ++i) { + if (x_dim[i] < y_dim[i]) { + bcast_y = false; + break; + } + } + } + + // z = f1(x, f2(y)) + // z = f1(f2(x, y)) + if (bcast_y) { // Y should be broadcast. + FusedElemwiseAndActGradComputeWithBroadcast< + DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, true /*BcastY*/, + SameShapeOfIntermediateOutAndOut>(ctx, x_dim, y_dim, x, y, + intermediate_out, out, dout, axis, + dx, dy, dx_op, dy_op); + } else { + FusedElemwiseAndActGradComputeWithBroadcast< + DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, false /*BcastY*/, + SameShapeOfIntermediateOutAndOut>(ctx, y_dim, x_dim, x, y, + intermediate_out, out, dout, axis, + dx, dy, dx_op, dy_op); + } + } +} + +template +void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, + const framework::Tensor &x, + const framework::Tensor &y, int axis, + CompoundFunctor compound_functor, + framework::Tensor *out, + framework::Tensor *intermediate_out) { + if (KeepIntermediateOut) { + PADDLE_ENFORCE(intermediate_out, + "The keep_intermediate_value is opened, " + "intermediate_out should not be nullptr."); + } + + const framework::DDim &x_dim = x.dims(); + const framework::DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { + FusedElemwiseAndActComputeNoBroadcast( + ctx, x_dim, x, y, compound_functor, out, intermediate_out); + } else { + // Whether the shape of Y is a continuous subsequence of X, + // For more information please refer to the op's introduction. + bool bcast_y = x.dims().size() >= y.dims().size(); + if (x.dims().size() == y.dims().size()) { + for (int i = 0; i < x.dims().size(); ++i) { + if (x.dims()[i] < y.dims()[i]) { + bcast_y = false; + break; + } + } + } + + // z = f1(x, f2(y)) + // z = f1(f2(x, y)) + if (bcast_y) { // Y should be broadcast. + // In this case, + // for 'f2(y)', the shape of intermediate_out should be equal to the shape + // of Y. + // for 'f2(x, y)', the shape of intermediate_out should be equal to the + // shape of Out. + // the shape of Out should be equal to the shape of X. + FusedElemwiseAndActComputeWithBroadcast< + DeviceContext, T, CompoundFunctor, true /*BcastY*/, + KeepIntermediateOut, SameShapeOfIntermediateOutAndOut>( + ctx, x_dim /*OutShape*/, y_dim, x, y, compound_functor, axis, out, + intermediate_out); + } else { + // In this case, + // for 'f2(y)', the shape of intermediate_out should be equal to the shape + // of Out. + // for 'f2(x, y)', the shape of intermediate_out should be equal to the + // shape of Out. 
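The dispatch above chooses the broadcast side purely from the two shapes: Y is broadcast when its rank is smaller, and when the ranks tie the first dimension where the sizes differ decides; the non-broadcast operand also fixes the shape of Out. A small sketch of that rule, with two concrete cases in the trailing comments:

    #include <cstdint>
    #include <vector>

    // Mirrors the bcast_y selection used by the fused compute/grad dispatch:
    // Y is the broadcast side unless X turns out to be the smaller operand.
    static bool ShouldBroadcastY(const std::vector<int64_t> &x_dims,
                                 const std::vector<int64_t> &y_dims) {
      bool bcast_y = x_dims.size() >= y_dims.size();
      if (x_dims.size() == y_dims.size()) {
        for (size_t i = 0; i < x_dims.size(); ++i) {
          if (x_dims[i] < y_dims[i]) {
            bcast_y = false;  // X is the smaller operand, so X is broadcast
            break;
          }
        }
      }
      return bcast_y;
    }

    // ShouldBroadcastY({2, 3, 4}, {3, 4})    -> true,  Out takes X's shape (2, 3, 4)
    // ShouldBroadcastY({3, 4},    {2, 3, 4}) -> false, Out takes Y's shape (2, 3, 4)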
+ // the shape of Out should be equal to the shape of Y. + FusedElemwiseAndActComputeWithBroadcast< + DeviceContext, T, CompoundFunctor, false /*BcastY*/, + KeepIntermediateOut, SameShapeOfIntermediateOutAndOut>( + ctx, y_dim /*OutShape*/, x_dim, x, y, compound_functor, axis, out, + intermediate_out); + } + } +} } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc index a7562b166b373ee2a8c9b6f379431d88d3e45fcb..b7224261e6a7ca82dff92a25f5fe8818c08e676d 100644 --- a/paddle/fluid/operators/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", + "X"); + REGISTER_OP_CPU_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index fe088b8203722a43b9aba7be3878b8f4ca68ba12..3385df0897700d37d60d8804a01db777ebc02a7e 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" namespace paddle { @@ -50,19 +51,21 @@ struct SubGradDY { }; template -class ElementwiseSubGradKernel : public framework::OpKernel { +class ElementwiseSubGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElemwiseGradCompute, SubGradDY>( + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; + + ElemwiseExplicitGradCompute, SubGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a297d03cfb041e584159a5fc5ba214f8ac404b4 --- /dev/null +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
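The elementwise_sub_grad change above works because, for Out = X - Y, neither gradient depends on X, Y, or Out: dX = dOut and dY = -dOut. That is why the kernel can alias x, y, and out to dout once the shared ElemwiseGradKernel has run. The two functors it relies on amount to the following (a sketch consistent with the calls shown, not copied from the header):

    template <typename T>
    struct SubGradDX {
      // dOut/dX = 1, so dX is just dOut
      T operator()(T /*x*/, T /*y*/, T /*out*/, T dout) const { return dout; }
    };

    template <typename T>
    struct SubGradDY {
      // dOut/dY = -1, so dY is -dOut
      T operator()(T /*x*/, T /*y*/, T /*out*/, T dout) const { return -dout; }
    };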
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ExtractRowsOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0], + framework::proto::VarType::SELECTED_ROWS, + "The type of input(X) must be SelectedRows."); + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector{in_dims[0], 1})); + } +}; + +class ExtractRowsOp : public framework::OperatorBase { + public: + ExtractRowsOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &in = scope.FindVar(Input("X"))->Get(); + auto out = scope.FindVar(Output("Out"))->GetMutable(); + + auto in_rows = in.rows(); + auto out_dim = framework::make_ddim( + std::vector{static_cast(in_rows.size()), 1}); + auto dst_ptr = out->mutable_data(out_dim, in.place()); + + if (paddle::platform::is_gpu_place(in.place())) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(in.place()); + auto src_ptr = in_rows.Data(in.place()); + auto stream = + reinterpret_cast(*dev_ctx) + .stream(); + memory::Copy(boost::get(out->place()), dst_ptr, + boost::get(in.place()), src_ptr, + in_rows.size() * sizeof(int64_t), stream); +#else + PADDLE_THROW("Not compiled with CUDA."); +#endif + } else { + memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(), + in_rows.data(), in_rows.size() * sizeof(int64_t)); + } + } +}; + +class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(SelectedRows). The input tensor of extract_rows operator," + " and its type is SelectedRows."); + AddOutput("Out", "(Tensor). The the rows of input(X)."); + + AddComment(R"DOC( + ExtractRows Operator. + +The function of extract_rows_op is extracting the rows from the input(X) +whose type is SelectedRows. + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker, + ops::ExtractRowsOpInferShape); diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 43f949111104ee56efc8625bdd609e412ef7f37d..2008e7027524ffd1f80a6eede015801b8a0b0254 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -18,15 +18,32 @@ limitations under the License. 
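extract_rows_op above does not touch the values of the SelectedRows input at all; it only copies the row-index vector into a dense int64 tensor of shape (N, 1). A host-side illustration of the data it produces:

    #include <cstdint>
    #include <vector>

    // A SelectedRows holding rows {0, 4, 7} of some larger table yields an
    // output tensor of shape (3, 1) containing exactly those indices.
    std::vector<int64_t> ExtractRows(const std::vector<int64_t> &selected_rows) {
      std::vector<int64_t> out(selected_rows.size());
      for (size_t i = 0; i < selected_rows.size(); ++i) {
        out[i] = selected_rows[i];  // out is viewed as shape (N, 1)
      }
      return out;                   // e.g. {0, 4, 7}
    }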
*/ namespace paddle { namespace operators { +template +struct DequantizeFunctor { + void operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor* scale, + T max_range, framework::Tensor* out) { + auto in_e = framework::EigenVector::Flatten(*in); + const T* scale_factor = scale->data(); + auto out_e = framework::EigenVector::Flatten(*out); + + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (scale_factor[0] / max_range) * in_e; + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; + class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: - FakeDequantizeMaxAbsOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + FakeDequantizeMaxAbsOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FakeDequantizeMaxAbsOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) The input with float-32/64 type is the " "low precision tensor."); + AddInput("Scale", "(float) The scale in quantization stage."); AddOutput("Out", "(Tensor) The output is the dequantized high " "precision tensor."); - AddAttr("num_bits", - "(int) `num_bits` is the quantization level bits, " - "such as 2, 5, 8."); - AddAttr("scale", - "(float) The maximum absolute value of low precision tensor." - "It is usually calculated by the fake_quantize_max_abs_op."); + AddAttr("max_range", "(float) The max range in quantization stage."); AddComment(R"DOC( FakeDequantizeMaxAbsOp operator. This calculation is an opposite operation of FakeQuantizeMaxAbsOp: -$$Out = \frac{scale*X}{2^{num_bits} - 1}$$ +$$Out = \frac{scale*X}{ max_range }$$ )DOC"); } diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 1bd38d1bd2c3a6f90d2fbad415d61efaead3afe9..225bcc45bc65bc9268d1e866a4358731eaf0c3ef 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -14,6 +14,42 @@ limitations under the License. 
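The reworked dequantize path above is a single scalar formula, Out = scale * X / max_range, with the scale now read from a tensor input instead of an attribute. A scalar sketch plus a concrete example (using max_range = 2^7 - 1 = 127 is my assumption for the usual 8-bit case; the op itself just uses whatever max_range it is given):

    #include <vector>

    std::vector<float> DequantizeMaxAbs(const std::vector<float> &in, float scale,
                                        float max_range) {
      std::vector<float> out(in.size());
      for (size_t i = 0; i < in.size(); ++i) {
        out[i] = in[i] * scale / max_range;
      }
      return out;
    }

    // Example: quantized values {127, -64, 25}, scale = 0.5, max_range = 127
    //   -> roughly {0.5, -0.252, 0.098}, i.e. values back in [-0.5, 0.5].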
*/ #include "paddle/fluid/operators/fake_dequantize_op.h" +namespace paddle { +namespace operators { + +template +__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num, + T* out) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < num) { + out[idx] = in[idx] * scale[0] / max_range; + } +} + +template +struct DequantizeFunctor { + void operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor* scale, + T max_range, framework::Tensor* out) { + const T* in_data = in->data(); + const T* scale_factor = scale->data(); + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + + int num = in->numel(); + int block = 512; + int grid = (num + block - 1) / block; + + KeDequantize<<>>( + in_data, scale_factor, max_range, num, out_data); + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 0901e68b3761159c3cc9c6684567bee38ec3f16d..d9923a10daa01ca06ebabb27cf9285b0628634bc 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -19,22 +19,29 @@ limitations under the License. */ namespace paddle { namespace operators { + +template +struct DequantizeFunctor { + void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in, + const framework::Tensor* scale, T max_range, + framework::Tensor* out); +}; + template class FakeDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); auto* out = ctx.Output("Out"); - out->mutable_data(in->place()); - int num_bits = ctx.Attr("num_bits"); - T scale = static_cast(ctx.Attr("scale")); - int range = std::pow(2, num_bits) - 1; + float max_range = ctx.Attr("max_range"); + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = *ctx.template device_context().eigen_device(); - eigen_out.device(dev) = (scale / range) * eigen_in; + DequantizeFunctor()(dev_ctx, in, scale, + static_cast(max_range), out); } }; diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e608eba05d5680254835f7b25f53d6a59e310e2a --- /dev/null +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -0,0 +1,230 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fake_quantize_op.h" +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +using EigenVectorArrayMap = + Eigen::TensorMap>; + +template +using ConstEigenVectorArrayMap = + Eigen::TensorMap>; + +template +struct FindAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* in, + const int num, T* out) { + Eigen::DSizes idim(num); + Eigen::DSizes odim(1); + Eigen::TensorMap> in_e(in, idim); + Eigen::TensorMap> out_e(out, odim); + + out_e = in_e.abs().maximum(); + } +}; + +template struct FindAbsMaxFunctor; + +template +struct ClipAndFakeQuantFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, framework::Tensor* out) { + T s = scale.data()[0]; + platform::Transform trans; + trans(ctx, in.data(), in.data() + in.numel(), + out->mutable_data(ctx.GetPlace()), ClipFunctor(-s, s)); + auto in_e = framework::EigenVector::Flatten(in); + auto out_e = framework::EigenVector::Flatten(*out); + + out_e.device(*ctx.eigen_device()) = (bin_cnt / s * in_e).round(); + } +}; + +template struct ClipAndFakeQuantFunctor; + +template +struct FindRangeAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& cur_scale, + const framework::Tensor& last_scale, + const framework::Tensor& iter, const int window_size, + framework::Tensor* scales_arr, framework::Tensor* out_scale) { + T* scale_arr = scales_arr->mutable_data(ctx.GetPlace()); + int64_t it = iter.data()[0]; + int idx = it % window_size; + T removed = scale_arr[idx]; + T cur = cur_scale.data()[0]; + scale_arr[idx] = cur; + + T max = last_scale.data()[0]; + if (max < cur) { + max = cur; + } else if (fabs(removed - max) < 1e-6) { + int size = (it > window_size) ? 
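The CPU FindRangeAbsMaxFunctor being defined here keeps a circular window of the last window_size scales: it overwrites the slot for the current step, takes max(last_scale, cur_scale) as the running scale, and rescans the window only when the entry it just evicted was the previous maximum. A compact sketch of that update rule (plain vectors stand in for the tensors):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Returns the new running max-abs scale after inserting cur at step `it`.
    float UpdateRangeAbsMax(std::vector<float> *window, int64_t it,
                            float last_scale, float cur) {
      int idx = static_cast<int>(it % window->size());
      float removed = (*window)[idx];
      (*window)[idx] = cur;

      float max = last_scale;
      if (max < cur) {
        max = cur;  // the new value dominates, no rescan needed
      } else if (std::fabs(removed - max) < 1e-6f) {
        // The evicted entry was the old maximum: rescan the valid part of the
        // window (clamped so the range is never empty on the very first step).
        int64_t size = std::min<int64_t>(std::max<int64_t>(it, 1),
                                         static_cast<int64_t>(window->size()));
        max = *std::max_element(window->begin(), window->begin() + size);
      }
      return max;
    }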
window_size : it; + FindAbsMaxFunctor()(ctx, scale_arr, size, + &max); + } + out_scale->mutable_data(ctx.GetPlace())[0] = max; + } +}; + +template struct FindRangeAbsMaxFunctor; + +class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { + public: + FakeQuantizeAbsMaxOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeQuantizeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutScale"), + "Output(Scale) of FakeQuantizeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddOutput("Out", + "(Tensor) Output of quantized low level tensor, " + "but also saved as float data type."); + AddOutput("OutScale", "(Tensor) Current scale"); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddComment(R"DOC( +FakeQuantize operator + +$$scale = max(abs(X))$$ +$$range = 2^{bit_length - 1} - 1$$ +$$Out = round(X/scale * range)$$ + +)DOC"); + } +}; + +class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { + public: + FakeQuantizeRangeAbsMaxOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeQuantizeRangeAbsMaxOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeRangeAbsMaxOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutScale"), + "Output(OutScale) of FakeQuantizeRangeAbsMaxOp should not be null"); + if (ctx->HasOutput("OutScales")) { + int window_size = ctx->Attrs().Get("window_size"); + ctx->SetOutputDim("OutScales", {window_size}); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class FakeQuantizeRangeAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddInput("InScale", "Last scale."); + AddInput("Iter", "Global step iteration.").AsDispensable(); + AddOutput("Out", "(Tensor) Output of quantized low level tensor."); + AddOutput("OutScale", " Current 
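The abs_max scheme documented above is one line of arithmetic: with bit_length = 8, bin_cnt = 2^7 - 1 = 127, so an input whose largest magnitude is 1.0 maps 0.2 to round(0.2 / 1.0 * 127) = 25. A host-side sketch of FindAbsMax followed by ClipAndFakeQuant:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // scale = max(|x|); out = round(clip(x, -scale, scale) / scale * bin_cnt).
    // Assumes the input is not all zeros, so scale > 0.
    std::vector<float> FakeQuantizeAbsMax(const std::vector<float> &x,
                                          int bit_length, float *scale_out) {
      const int bin_cnt = (1 << (bit_length - 1)) - 1;  // 127 for 8 bits
      float scale = 0.f;
      for (float v : x) scale = std::max(scale, std::fabs(v));
      *scale_out = scale;

      std::vector<float> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        float v = std::min(std::max(x[i], -scale), scale);
        out[i] = std::round(v / scale * bin_cnt);
      }
      return out;
    }

    // Example: x = {0.2, -1.0, 0.6}, bit_length = 8
    //   -> scale = 1.0, out = {25, -127, 76}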
scale"); + AddOutput("OutScales", "(Tensor) scale buffer.").AsDispensable(); + AddAttr("window_size", "(int, default 10000) window range size.") + .SetDefault(10000); + AddAttr("bit_length", "(int, default 8), quantization bit number.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddAttr("is_test", "").SetDefault(false); + AddComment(R"DOC( +FakeQuantize operator is used in static quantization. + +$$scale = max(max(abs(x)), history_abs_max)$$ +$$range = 2^{bit_length - 1} - 1$$ +$$Out = round(X/scale * range)$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxOp, + ops::FakeQuantizeAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max, + ops::FakeQuantizeAbsMaxKernel); + +REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, + ops::FakeQuantizeRangeAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, + ops::FakeQuantizeRangeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a0ff6396210c2b3a7f8bd6b9f274b875d7fd4933 --- /dev/null +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/fake_quantize_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { + int bid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x; + + extern __shared__ T shared_max_data[]; + if (gridDim.x > 1) { + shared_max_data[tid] = T(0); + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { + T tmp = fabs(in[i]); + if (tmp > shared_max_data[tid]) { + shared_max_data[tid] = tmp; + } + } + } else { + if (bid < n) { + shared_max_data[tid] = fabs(in[bid]); + } else { + shared_max_data[tid] = T(0); + } + } + __syncthreads(); + + for (int i = blockDim.x / 2; i > 0; i >>= 1) { + if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { + shared_max_data[tid] = shared_max_data[tid + i]; + } + __syncthreads(); + } + if (tid == 0) { + out[blockIdx.x] = shared_max_data[0]; + } +} + +template +struct FindAbsMaxFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* in, + const int num, T* out) { + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? 
block : grid; + + framework::Tensor max; + T* max_data = + max.mutable_data(framework::make_ddim({grid}), ctx.GetPlace()); + FindAbsMaxKernel<<>>( + in, num, max_data); + FindAbsMaxKernel<<<1, block, 1024 * sizeof(T), ctx.stream()>>>( + max_data, grid, out); + } +}; + +template struct FindAbsMaxFunctor; + +template +__global__ void ClipAndQuantKernel(const T* in, const T* scale, + const int bin_cnt, const int n, T* out) { + int bid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x; + + T s = scale[0]; + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { + T x = in[bid]; + T v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt / s * v; + out[bid] = round(v); + } +} + +template +__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale, + const T* last_scale, + const int64_t* iter, + const int window_size, T* scale_arr, + T* out_scale, int* need_find_max, + int* out_size) { + int it = iter[0]; + int idx = it % window_size; + T removed = scale_arr[idx]; + T cur = cur_scale[0]; + scale_arr[idx] = cur; + T max = last_scale[0]; + out_scale[0] = max < cur ? cur : max; + if (fabs(removed - max) < 1e-6) { + need_find_max[0] = 1; + out_size[0] = it > window_size ? window_size : it; + } else { + need_find_max[0] = 0; + } +} + +template +struct FindRangeAbsMaxFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& cur_scale, + const framework::Tensor& last_scale, + const framework::Tensor& iter, const int window_size, + framework::Tensor* scales_arr, framework::Tensor* out_scale) { + const auto gpu_place = boost::get(ctx.GetPlace()); + + T* scale_arr = scales_arr->mutable_data(gpu_place); + T* out_scale_data = out_scale->mutable_data(gpu_place); + + framework::Tensor need_find_max, out_size; + int* find_max = need_find_max.mutable_data(gpu_place); + int* out_size_data = out_size.mutable_data(gpu_place); + + FindRangeAbsMaxAndFillArray<<<1, 1, 0, ctx.stream()>>>( + cur_scale.data(), last_scale.data(), iter.data(), + window_size, scale_arr, out_scale_data, find_max, out_size_data); + + int g_find_max; + memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max, + sizeof(int), 0); + if (g_find_max) { + int len; + memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data, + sizeof(int), 0); + FindAbsMaxFunctor()(ctx, scale_arr, len, + out_scale_data); + } + } +}; + +template struct FindRangeAbsMaxFunctor; + +template +struct ClipAndFakeQuantFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, framework::Tensor* out) { + int num = in.numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + + const T* in_data = in.data(); + const T* scale_data = scale.data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + ClipAndQuantKernel<<>>( + in_data, scale_data, bin_cnt, num, out_data); + } +}; + +template struct ClipAndFakeQuantFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, + ops::FakeQuantizeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, + ops::FakeQuantizeRangeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ace7573ec5c03ab8788cfc0aab614b7f80ea073 --- /dev/null +++ b/paddle/fluid/operators/fake_quantize_op.h @@ 
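The CUDA FindAbsMaxFunctor above is a two-pass reduction: the first launch has every block write its partial max(|x|) into a scratch tensor, and a second single-block launch reduces those partials; clamping grid to at most block keeps the number of partials within what that single block can cover. The same structure in host code (chunks stand in for blocks):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    float TwoPassAbsMax(const std::vector<float> &in, int num_chunks) {
      std::vector<float> partial(num_chunks, 0.f);    // one slot per "block"
      const size_t chunk = (in.size() + num_chunks - 1) / num_chunks;
      for (int c = 0; c < num_chunks; ++c) {          // pass 1: per-chunk maxima
        const size_t begin = c * chunk;
        const size_t end = std::min(in.size(), begin + chunk);
        for (size_t i = begin; i < end; ++i) {
          partial[c] = std::max(partial[c], std::fabs(in[i]));
        }
      }
      float result = 0.f;                             // pass 2: reduce the partials
      for (float p : partial) result = std::max(result, p);
      return result;
    }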
-0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +template +struct FindAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const T* in, const int num, T* out); +}; + +template +struct ClipAndFakeQuantFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in, + const framework::Tensor& scale, const int bin_cnt, + framework::Tensor* out); +}; + +template +struct FindRangeAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale, + const framework::Tensor& last_scale, + const framework::Tensor& iter, const int window_size, + framework::Tensor* scales_arr, framework::Tensor* out_scale); +}; + +template +class FakeQuantizeAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + + auto* out = context.Output("Out"); + auto* out_scale = context.Output("OutScale"); + T* out_s = out_scale->mutable_data(context.GetPlace()); + + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + + auto& dev_ctx = context.template device_context(); + const T* in_data = in->data(); + FindAbsMaxFunctor()(dev_ctx, in_data, in->numel(), out_s); + ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, + bin_cnt, out); + } +}; + +template +class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* in_scale = context.Input("InScale"); + + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + bool is_test = context.Attr("is_test"); + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + auto& dev_ctx = context.template device_context(); + + // testing + if (is_test) { + ClipAndFakeQuantFunctor()(dev_ctx, *in, *in_scale, + bin_cnt, out); + return; + } + + // training + auto* out_scale = context.Output("OutScale"); + auto* out_scales = context.Output("OutScales"); + auto* iter = context.Input("Iter"); + + int window_size = context.Attr("window_size"); + out_scale->mutable_data(context.GetPlace()); + + framework::Tensor cur_scale; + T* cur_scale_data = cur_scale.mutable_data({1}, context.GetPlace()); + FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), + cur_scale_data); + FindRangeAbsMaxFunctor()(dev_ctx, cur_scale, *in_scale, + *iter, window_size, out_scales, + out_scale); + ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, + bin_cnt, out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc index 
99fa659a351249a4a93f71700e1c646465861aba..e595f1a627cfefbb91b070b898046cf135dc4988 100644 --- a/paddle/fluid/operators/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -125,13 +125,16 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { auto input = ctx.Input("Input"); auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, "Input must be with 2 or 4 dimensions, i.e. NCHW"); + // TODO(intel friends): the native weight format is io, + // but the mkldnn weight format is oihw, which may need to be transposed. PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4, "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW"); - bool with_bias = ctx.Attr("bias_attr"); + bool with_bias = bias != nullptr; MKLDNNMD md(input, w, with_bias); std::shared_ptr pd = @@ -154,6 +157,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_memory = mem.dst(output_data); auto src_memory = mem.src(input_data); auto weights_memory = mem.weights(w_data); + // TODO(intel friends): bias memory should also be obtained from bias->data() auto bias_memory = mem.bias(); auto forward = with_bias ? mkldnn::inner_product_forward( @@ -216,7 +220,8 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); const T* out_grad_data = out_grad->data(); - bool with_bias = ctx.Attr("bias_attr"); + auto bias = ctx.Input("Bias"); + bool with_bias = bias != nullptr; MKLDNNMD md(input, w, with_bias); MKLDNNMemory mem(&md, mkldnn_engine); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed..fa4dec9cf118cef9b836943fd4eae90d23e6218a 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -14,6 +14,8 @@ limitations under the License.
*/ #include "paddle/fluid/operators/fc_op.h" #include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/fc_compute.h" namespace paddle { namespace operators { @@ -25,16 +27,29 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { "Out(Output) of Fully Connected should not be null."); PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) of Fully Connected should not be null."); - + // NCHW auto in_dims = ctx->GetInputDim("Input"); + // IO, I=C*H*W auto w_dims = ctx->GetInputDim("W"); std::vector output_shape({in_dims[0], w_dims[1]}); + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + if (bias_dims.size() == 2) { + PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], + "The shape of Bias must be [1, dim]."); + } else if (bias_dims.size() == 1) { + PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1], + "The shape of Bias must be [1, dim]."); + } + } PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, "Fully Connected input should be 2-D or 4-D tensor."); - - PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4, - "Fully Connected input should be 2-D or 4-D tensor."); + PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, + "Fully Connected input should be 2-D tensor."); + PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0], + "Fully Connected input and weigth size do not match."); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("Input", "Out"); @@ -42,9 +57,12 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kMKLDNN}; - + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + if (ctx.Attr("use_mkldnn")) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout, library); @@ -60,43 +78,77 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { if (ctx->HasOutput(framework::GradVarName("W"))) { ctx->SetOutputDim(framework::GradVarName("W"), w_dims); } + + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Should have bias grad"); + auto bias_dims = ctx->GetInputDim("Bias"); + ctx->SetOutputDim(framework::GradVarName("Bias"), bias_dims); + } } framework::OpKernelType FCOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kMKLDNN}; - + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + if (ctx.Attr("use_mkldnn")) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout, library); } void FCOpMaker::Make() { - AddInput("Input", "(Tensor) The input tensor of fully connected operator. "); - AddInput("W", "(Tensor), The second input tensor of fc op."); + AddInput("Input", + "(Tensor), The input tensor of fully connected operator with format " + "(NCHW). 
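The reworked InferShape above pins the FC shapes down: Input is NCHW flattened to (batch, K) with K = C * H * W = w_dims[0], W is (K, O), Bias is (1, O) or (O), and Out is (batch, O). The CPU FCOpKernel added a little further down calls math::FCCompute(blas, batch, O, K, ...); the sketch below spells out the computation those arguments imply, a GEMM followed by a broadcast bias add, which is my reading of the call rather than the helper's actual source:

    // out[M x N] = in[M x K] * w[K x N] (+ bias[N], broadcast over rows).
    // M = batch size, N = output width (O), K = flattened input width (I).
    void NaiveFCCompute(int M, int N, int K, const float *in, const float *w,
                        float *out, const float *bias /* may be null */) {
      for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
          float acc = bias ? bias[n] : 0.f;
          for (int k = 0; k < K; ++k) {
            acc += in[m * K + k] * w[k * N + n];  // weight stored row-major as (I, O)
          }
          out[m * N + n] = acc;
        }
      }
    }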
"); + AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); + AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") + .AsDispensable(); AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr("bias_attr", "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); AddComment(R"DOC( Fully Connected Operator. - The fully connected operation calculates the output based on the input, weights and bias attribute. + The fully connected operation calculates the output based on the input, weights and bias. The size of each dimension of the parameters checked in the infer-shape. - The matrix of bias is generated by the mkldnn framework, when the bias_attr is True. - Additional parametrs are use_mkldnn and bias_attr. - The input(X) size and output(Out) size may be diffrent. - - The fully connected layer only supports MKLDNN version )DOC"); } +template +class FCOpKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + auto input = ctx.Input("Input"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); + auto output = ctx.Output("Out"); + auto in_dims = input->dims(); + auto w_dims = w->dims(); + + const T* input_data = input->data(); + const T* w_data = w->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + auto blas = math::GetBlas(ctx); + math::FCCompute( + blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, + bias ? bias->data() : NULL); + + // TODO(TJ): fuse act + } +}; + } // namespace operators } // namespace paddle -REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, +namespace ops = paddle::operators; +REGISTER_OPERATOR(fc, ops::FCOp, ops::FCOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad); +REGISTER_OPERATOR(fc_grad, ops::FCOpGrad); +REGISTER_OP_CPU_KERNEL(fc, ops::FCOpKernel, ops::FCOpKernel); diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc index bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0..dc7ef664958238ddbd48745bd59cc7db28e49f5b 100644 --- a/paddle/fluid/operators/feed_op.cc +++ b/paddle/fluid/operators/feed_op.cc @@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase { const platform::Place &place) const override { // get device context from pool auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 02beb80fc8a9f451393dcdd54492c4f88f908497..9d7ac7ab6194593747548fac3cefc8d4ed3058d8 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -36,28 +36,24 @@ class FetchBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); - 
rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { VLOG(3) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( SendBarrier operator diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index 1640a2a22c69a0e3ab81a2889d6105b2cf4162b7..c197b45e8196a47def6465128e8ca39d8daefed6 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 130f18dde4f979a6a9925ede9cbf745fcec14d48..2826b82117db113d4d8c10095e89f610ca895775 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -41,19 +40,33 @@ class FillConstantOp : public framework::OperatorBase { static_cast(Attr("dtype")); auto value = Attr("value"); auto force_cpu = Attr("force_cpu"); - auto &out = - *scope.FindVar(Output("Out"))->GetMutable(); - out.Resize(framework::make_ddim(Attr>("shape"))); + + framework::Tensor *tensor = nullptr; + + auto &out_var = *scope.FindVar(Output("Out")); + + if (out_var.IsType()) { + tensor = out_var.GetMutable(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + if (force_cpu) { auto cpu = platform::CPUPlace(); - out.mutable_data(cpu, framework::ToTypeIndex(data_type)); + tensor->mutable_data(cpu, framework::ToTypeIndex(data_type)); } else { - out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); + tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type)); } platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, &out, value); + math::set_constant(dev_ctx, tensor, value); } }; diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index 925dc19061e2196a40411f415eb6e5ad59ab52ff..adc7cb1f9e48ba5fabeb91c5e3ecec016db34a45 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -25,7 +25,7 @@ struct FillOpVisitor { : tensor_(tensor), value_(value) {} template - void operator()() const { + void apply() const { platform::CPUPlace cpu; auto *data = tensor_->mutable_data(cpu); std::transform(value_.data(), 
value_.data() + tensor_->numel(), data, diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e80dc0e641c443923076c31e269689b5bc134a7 --- /dev/null +++ b/paddle/fluid/operators/flatten_op.cc @@ -0,0 +1,284 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FlattenOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input (X) of Flatten op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output (Output) of Flatten op should not be null."); + const auto &axis = ctx->Attrs().Get("axis"); + const auto &in_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0."); + PADDLE_ENFORCE( + axis <= in_dims.size(), + "The axis should be less than or equal to input tensor's rank."); + + const auto &out_dims = GetOutputShape(axis, in_dims); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (in_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", "Out"); + } + } + + static std::vector GetOutputShape(const int axis, + const framework::DDim &in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < in_dims.size(); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = outer; + out_shape[1] = inner; + return out_shape; + } +}; + +class FlattenOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axis = Attr("axis"); + auto in_dims = + scope.FindVar(Input("X"))->Get().dims(); + const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims); + + framework::AttributeMap attrs; + attrs["shape"] = out_dims; + attrs["inplace"] = false; + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) A tensor of rank >= axis."); + AddOutput("Out", + "A 2D tensor is reshaped input tensor. The input dimensions" + "up to axis are flattened to the outer dimension of the output" + "and the remaining input dimensions are flattened into the inner" + "dimension of the output."); + AddAttr("axis", + "(int)" + "Indicate up to which input dimensions (exclusive) should be" + "flattened to the outer dimension of the output. 
The value" + "for axis must be in the range [0, R], where R is the rank of" + "the input tensor. When axis = 0, the shape of the output" + "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" + "input tensor is (d_0, d_1, ... d_n).") + .SetDefault(1); + AddComment(R"DOC( +Flatten Operator + +Flattens the input tensor into a 2D matrix. + +Examples: +Case 1: + Given + X.shape = (3, 100, 100, 4) + and + axis = 2 + We get: + Out.shape = (3 * 100, 4 * 100) + +Case 2: + Given + X.shape = (3, 100, 100, 4) + and + axis = 0 + We get: + Out.shape = (1, 3 * 100 * 100 * 4) +)DOC"); + } +}; + +class FlattenGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class FlattenGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto in_dims = + scope.FindVar(Input("X"))->Get().dims(); + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(in_dims); + attrs["inplace"] = false; + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, + attrs); + reshape_op->Run(scope, place); + } +}; + +// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten, +// the XShape is used to carry the shape and lod of X which will be used in +// flatten_grad, in this way, the framework can reuse the memory of X +// immediately the flatten2_op is finished. 
+// Considering compatibility issues, we could not fix flatten2_op +class Flatten2OpInferShape : public FlattenOpInferShape { + public: + void operator()(framework::InferShapeContext *ctx) const override { + FlattenOpInferShape::operator()(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output (XShape) of Flatten op should not be null."); + const auto &in_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(in_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < in_dims.size(); ++i) { + xshape_dims[i + 1] = in_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", "XShape"); + } +}; + +class Flatten2Op : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axis = Attr("axis"); + auto in_dims = + scope.FindVar(Input("X"))->Get().dims(); + const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims); + + framework::AttributeMap attrs; + attrs["shape"] = out_dims; + attrs["inplace"] = false; + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class Flatten2OpMaker : public FlattenOpMaker { + public: + void Make() override { + FlattenOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in FlattenGradOp.") + .AsIntermediate(); + } +}; + +class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("flatten2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Flatten2GradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("XShape"), + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } +}; + +class Flatten2GradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto xshape_name = Input("XShape"); + auto xshape_dims = + scope.FindVar(xshape_name)->Get().dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + attrs["inplace"] = false; + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, + {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); + 
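+    // Worked example (following the shapes used in the Flatten DOC above):
+    // with X.shape = (3, 100, 100, 4) and axis = 2, the forward pass stored
+    // XShape.dims() = (0, 3, 100, 100, 4); slicing off the leading
+    // placeholder recovers x_dims = (3, 100, 100, 4), so the gradient is
+    // simply dX = reshape2(dOut) back to that shape.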
reshape_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle + +USE_OP(reshape); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, + ops::FlattenOpInferShape, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); + +REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, + ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker); +REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, + ops::Flatten2GradInferShape); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b54f0091b3fe21222b4690f4dcff1c081d4799e7 --- /dev/null +++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc @@ -0,0 +1,341 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_elemwise_activation_op.h" +#include +#include + +namespace paddle { +namespace operators { + +/* + * Whether the compound function is Unary(Binary(X, Y)). + * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final + * out. + */ +static bool IsUnaryCompound(const std::vector &functor_list) { + PADDLE_ENFORCE_EQ(functor_list.size(), 2); + static std::unordered_set binary_fun = { + "elementwise_add", "elementwise_mul", "elementwise_add_grad", + "elementwise_mul_grad"}; + return binary_fun.count(functor_list[1]) != 0; +} + +/* + * Whether the Input(X) could be absent. + */ +static bool InputXCanBeAbsent(const std::vector &functor_list) { + PADDLE_ENFORCE_EQ(functor_list.size(), 2); + static std::unordered_set binary_fun = {"elementwise_add_grad"}; + return binary_fun.count(functor_list[0]) != 0 || + binary_fun.count(functor_list[1]) != 0; +} + +/* + * Whether the compound function is supported. + * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final + * out. 
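+ * e.g. {"elementwise_add", "relu"} (X + relu(Y)) and
+ * {"relu", "elementwise_add"} (relu(X + Y)) are supported, while
+ * {"relu", "scale"} is rejected because one of the two functors must be an
+ * elementwise (binary) functor.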
+ */ +static bool IsSupportedCompound(const std::vector &functors) { + static std::unordered_set unary_fun = {"scale", "relu"}; + static std::unordered_set binary_fun = {"elementwise_add", + "elementwise_mul"}; + + std::string unary_fun_str; + if (binary_fun.count(functors[0])) { + unary_fun_str = functors[1]; + } else if (binary_fun.count(functors[1])) { + unary_fun_str = functors[0]; + } else { + PADDLE_THROW("%s and %s are not included in fused_list.", functors[0], + functors[1]); + } + PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1, + "%s is not included in fused_list.", unary_fun_str); + return true; +} + +class FusedElemwiseActivationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FusedElemwiseActivationOp op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Y"), + "Input(Y) of FusedElemwiseActivationOp op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FusedElemwiseActivationOp op should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + + // Whether the shape of Y is a continuous subsequence of X, + // For more information please refer to the op's introduction. + bool bcast_y = x_dim.size() >= y_dim.size(); + if (x_dim.size() == y_dim.size()) { + for (int i = 0; i < x_dim.size(); ++i) { + if (x_dim[i] < y_dim[i]) { + bcast_y = false; + break; + } + } + } + + auto &out_dim = bcast_y ? x_dim : y_dim; + std::string out_lod = bcast_y ? "X" : "Y"; + + if (ctx->Attrs().Get("keep_intermediate_value")) { + PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"), + "Output(IntermediateOut) of FusedElemwiseActivationOp " + "should not be null."); + + if (IsUnaryCompound( + ctx->Attrs().Get>("functor_list"))) { + // for Unary(Binary(X, Y)), the shape and lod of out and + // intermediate_out are the same. + ctx->SetOutputDim("IntermediateOut", out_dim); + // set the lod of intermediate_out + ctx->ShareLoD(out_lod, /*->*/ "IntermediateOut"); + } else { + // for Binary(X, Unary(Y)), the shape and lod of Y and + // intermediate_out are the same. 
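+        // e.g. functor_list = {"elementwise_add", "scale"} with
+        // X.shape = (2, 3, 4, 5) and Y.shape = (4, 5):
+        // IntermediateOut = scale(Y) keeps shape (4, 5), while
+        // Out = X + scale(Y) takes the broadcast shape (2, 3, 4, 5).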
+ ctx->SetOutputDim("IntermediateOut", y_dim); + // set the lod of intermediate_out + ctx->ShareLoD("Y", /*->*/ "IntermediateOut"); + } + } + ctx->SetOutputDim("Out", out_dim); + ctx->ShareLoD(out_lod, /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->type(), + ctx.Input("Y")->type(), + "The element's type of input should be the same."); + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor) The input tensor of fused_elemwise_activation operator."); + AddInput( + "Y", + "(Tensor) The input tensor of fused_elemwise_activation operator."); + AddOutput("Out", + "vector The output tensor of fused_elemwise_activation " + "operator."); + AddOutput("IntermediateOut", + "Tensor The IntermediateOut tensor of fused_elemwise_activation " + "operator.") + .AsIntermediate(); + AddAttr("axis", + "axis is used by elementwise_op, the default value is -1.") + .SetDefault(-1); + AddAttr("scale", + "scale is used by scale_op, the default value is 0.0.") + .SetDefault(0.0); + AddAttr( + "recomputation", + "Whether to recompute the Out." + "The computation of fused_elemwise_activation_grad has two methods to " + "get the dx and dy, one is to use the 'Out', and the other is not. " + "The former method will save the time of recomputing the 'Out', but it " + "must occupy the memory to store the 'out'. While, the later method " + "can avoid occupying the memory, but it must recompute the 'Out'. " + "It is useful for Unary(Binary(X, Y)). The default value is true.") + .SetDefault(true); + AddAttr("keep_intermediate_value", + "Whether to save the intermediate_out.") + .SetDefault(false); + AddAttr>("functor_list", + "The functors that should be fused.") + .AddCustomChecker([&](const std::vector &functor_list) { + PADDLE_ENFORCE(IsSupportedCompound(functor_list)); + }); + + AddComment(R"DOC( +FusedElemwiseActivation Operator. + +At present, FusedElemwiseActivation only supports Two kinds of compound +operators (elementwise_op and activation_op): + + Z = Binary(X, Unary(Y)) + Z = Unary(Binary(X, Y)) + +There are two cases for this operator: + +1. The shape of $Y$ and $X$ is the same. +2. The shape of $Y$ is a continuous subsequence of $X$ or the shape of $X$ is a continuous subsequence of $Y$. + +For case 2 (assume that the shape of $Y$ is a continuous subsequence of $X$ ): + +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. +2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of + subsequence, such as shape(Y) = (2, 1) => (2). + +For example: + + .. code-block:: python + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 + + +The inputs $X$ and $Y$ can carry the different LoD information. +But the output only shares the LoD information with the one whose shape is the same with Out. 
+The attributions of activation_op can be get from fused_elemwise_activation_op's. +The functor_list records the functions to be fused, for example +["scale", "elementwise_add"]. + +)DOC"); + } +}; + +class FusedElemwiseActivationGradMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType(this->ForwardOpType() + "_grad"); + + for (auto &input_param : this->InputNames()) { + op_desc_ptr->SetInput(input_param, this->Input(input_param)); + op_desc_ptr->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, true)); + } + + for (auto &output_param : this->OutputNames()) { + op_desc_ptr->SetInput(output_param, this->Output(output_param)); + op_desc_ptr->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + + op_desc_ptr->SetAttrMap(this->Attrs()); + + std::vector functor_names = + boost::get>( + op_desc_ptr->GetAttr("functor_list")); + functor_names[0] += "_grad"; + functor_names[1] += "_grad"; + op_desc_ptr->SetAttr("functor_list", functor_names); + return std::unique_ptr(op_desc_ptr); + } +}; + +class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + if (ctx->Attrs().Get("keep_intermediate_value")) { + PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"), + "Input(IntermediateOut) should not be null"); + } else { + PADDLE_ENFORCE_EQ(ctx->Inputs(framework::GradVarName("Out")).size(), 1); + } + + auto funtor_list = + ctx->Attrs().Get>("functor_list"); + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (ctx->HasOutput(x_grad_name)) { + if (ctx->HasInputs("X")) { + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); + ctx->ShareLoD("X", x_grad_name); + } else { + // Node: If "X" is absence, the shape of Y should be a continuous + // subsequence of X, if not, we could not infer the shape of dx. + + // Currently, only when Binary is elementwise_add or elementwise_sub, + // the "X" could be absent. + PADDLE_ENFORCE(InputXCanBeAbsent(funtor_list), + "Only when BinaryFunctor is elementwise_add, the 'X' " + "could be absent."); + + // For Unary(Binary(X, Y)), IntermediateOut should not be empty. 
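+        // e.g. for functor_list = {"relu_grad", "elementwise_add_grad"}
+        // (the backward of Out = relu(X + Y)): since d(X + Y)/dX = 1, dX can
+        // be shaped from dOut even when X is absent, but the stored
+        // IntermediateOut (= X + Y) must be present for the relu gradient.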
+ if (IsUnaryCompound(funtor_list)) { + PADDLE_ENFORCE( + ctx->HasInputs("IntermediateOut"), + "If the compound_functor is Unary(Binary(X, Y)) and Binary " + "is elementwise_add, the intermediate_out must be not absent."); + } + + ctx->SetOutputDim(x_grad_name, + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name); + } + } + if (ctx->HasOutput(y_grad_name)) { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", y_grad_name); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + auto input_data_type_index = ctx.Input("Y")->type(); + auto input_data_type = framework::ToDataType(input_data_type_index); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp, + ops::FusedElemwiseActivationMaker, + ops::FusedElemwiseActivationGradMaker); +REGISTER_OPERATOR(fused_elemwise_activation_grad, + ops::FusedElemwiseActivationOpGrad); + +REGISTER_OP_CPU_KERNEL( + fused_elemwise_activation, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); + +REGISTER_OP_CPU_KERNEL( + fused_elemwise_activation_grad, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused_elemwise_activation_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e1d2b16b4b5e3a480777f834c2cbeb6d00a755e4 --- /dev/null +++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_elemwise_activation_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_elemwise_activation, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_elemwise_activation_grad, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..6321541aab7e31cd703289bb8951245215ecb3e2 --- /dev/null +++ b/paddle/fluid/operators/fused_elemwise_activation_op.h @@ -0,0 +1,397 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/compound_functors.h" +#include "paddle/fluid/operators/math/functors.h" + +namespace paddle { +namespace operators { + +template +static void RunBinaryCompoundFunctor( + const framework::ExecutionContext &ctx, const BinaryFunctor &binary_functor, + const UnaryFunctor &unary_functor, const framework::Tensor &in_x, + const framework::Tensor &in_y, std::vector *outputs) { + // Z = Binary(X, Unary(Y)) + // intermediate_out = Unary(Y) + // out = Binary(X, Unary(Y)) + // In this case, the shape of intermediate_out and out are different. + paddle::operators::math::BinaryCompoundFunctor + compound_func(binary_functor, unary_functor); + int axis = ctx.Attr("axis"); + if (ctx.Attr("keep_intermediate_value")) { + FusedElemwiseAndActComputeEx, + true /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } else { + FusedElemwiseAndActComputeEx, + false /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } +} + +template +static void RunUnaryCompoundFunctors( + const framework::ExecutionContext &ctx, const UnaryFunctor &unary_functor, + const BinaryFunctor &binary_functor, const framework::Tensor &in_x, + const framework::Tensor &in_y, std::vector *outputs) { + // Z = Unary(Binary(X, Y)) + // intermediate_out = Binary(X, Y) + // out = Unary(Binary(X, Y)) + // In this case, the shape of intermediate_out and out are the same. 
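+  // e.g. functor_list = {"scale", "elementwise_add"}:
+  // intermediate_out = X + Y and out = scale * (X + Y), so both carry the
+  // same shape and LoD.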
+ int axis = ctx.Attr("axis"); + + paddle::operators::math::UnaryCompoundFunctor + compound_func(unary_functor, binary_functor); + + if (ctx.Attr("keep_intermediate_value")) { + FusedElemwiseAndActComputeEx, + true /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } else { + FusedElemwiseAndActComputeEx, + false /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } +} + +template +static void RunBinaryCompoundGradFunctors( + const framework::ExecutionContext &ctx, + const BinaryGradFunctor &binary_grad_functor, + const UnaryFunctor &unary_functor, + const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x, + const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_intermediate_out, + const framework::Tensor *in_out_grad, framework::Tensor *x_grad, + framework::Tensor *y_grad) { + // Z = Binary(X, Unary(Y)) + int axis = ctx.Attr("axis"); + + using BinaryCompoundDxFunctor = + paddle::operators::math::BinaryCompoundGradDxFunctor; + using BinaryCompoundDyFunctor = + paddle::operators::math::BinaryCompoundGradDyFunctor< + T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor>; + + if (in_intermediate_out) { + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, + true /*UseIntermediateOut*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, + unary_grad_functor)); + } else { + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, + false /*UseIntermediateOut*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, + unary_grad_functor)); + } +} + +template +static void RunUnaryCompoundGradFunctors( + const framework::ExecutionContext &ctx, + const UnaryGradFunctor &unary_grad_functor, + const BinaryFunctor &binary_functor, + const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x, + const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_intermediate_out, + const framework::Tensor *in_out_grad, framework::Tensor *x_grad, + framework::Tensor *y_grad) { + // Z = Unary(Binary(X, Y)) + int axis = ctx.Attr("axis"); + + using UnaryCompoundDxFunctor = + paddle::operators::math::UnaryCompoundGradDxFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; + using UnaryCompoundDyFunctor = + paddle::operators::math::UnaryCompoundGradDyFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; + + if (in_intermediate_out) { + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor, + true /*UseIntermediateOut*/, true /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), + UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, + binary_grad_functor)); + } else { + 
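+    // Same functors as above, but without a stored IntermediateOut
+    // (UseIntermediateOut = false); the intermediate value is recomputed
+    // inside the compound grad functors when it is needed.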
FusedElemwiseAndActGradComputeEx( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), + UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, + binary_grad_functor)); + } +} + +template +static void RunFunctors(const framework::ExecutionContext &ctx, + const framework::Tensor &in_x, + const framework::Tensor &in_y, + std::vector *outputs) { + auto &functors = ctx.Attr>("functor_list"); + + // TODO(zcd): The following code can be refined. + auto funcs_str = functors[0] + "," + functors[1]; + if (funcs_str == "elementwise_add,scale") { + // Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundFunctor, + paddle::operators::math::ScaleFunctor>( + ctx, paddle::operators::math::AddFunctor(), + paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); + } else if (funcs_str == "scale,elementwise_add") { + // Z = Unary(Binary(X, Y)) + T scale = static_cast(ctx.Attr("scale")); + RunUnaryCompoundFunctors, + paddle::operators::math::AddFunctor>( + ctx, paddle::operators::math::ScaleFunctor(scale), + paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + } else if (funcs_str == "elementwise_add,relu") { + // Z = Binary(X, Unary(Y)) + RunBinaryCompoundFunctor, + paddle::operators::math::ReluFunctor>( + ctx, paddle::operators::math::AddFunctor(), + paddle::operators::math::ReluFunctor(), in_x, in_y, outputs); + } else if (funcs_str == "relu,elementwise_add") { + // Z = Unary(Binary(X, Y)) + RunUnaryCompoundFunctors, + paddle::operators::math::AddFunctor>( + ctx, paddle::operators::math::ReluFunctor(), + paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + } else if (funcs_str == "elementwise_mul,scale") { + // Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundFunctor, + paddle::operators::math::ScaleFunctor>( + ctx, paddle::operators::math::MulFunctor(), + paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); + } else { + PADDLE_THROW("%s has not been implemented.", funcs_str); + } +} + +template +static void RunGradFunctors(const framework::ExecutionContext &ctx, + const framework::Tensor *in_x, + const framework::Tensor *in_y, + const framework::Tensor *in_out, + const framework::Tensor *in_intermediate_out, + const framework::Tensor *in_out_grad, + framework::Tensor *x_grad, + framework::Tensor *y_grad) { + auto &functors = ctx.Attr>("functor_list"); + auto funcs_str = functors[0] + "," + functors[1]; + + // TODO(zcd): The following code can be refined. 
for example, use registrition + if (funcs_str == "elementwise_add_grad,scale_grad") { + // The backward of Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundGradFunctors, + paddle::operators::math::ScaleFunctor, + paddle::operators::math::ScaleGradFunctor>( + ctx, paddle::operators::math::AddGradFunctor(), + paddle::operators::math::ScaleFunctor(scale), + paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); + } else if (funcs_str == "scale_grad,elementwise_add_grad") { + // The backward of Z = Unary(Binary(X, Y)) + T scale = static_cast(ctx.Attr("scale")); + RunUnaryCompoundGradFunctors, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, + ReComputation /*Recomputation*/>( + ctx, paddle::operators::math::ScaleGradFunctor(scale), + paddle::operators::math::AddFunctor(), + paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); + } else if (funcs_str == "elementwise_add_grad,relu_grad") { + RunBinaryCompoundGradFunctors, + paddle::operators::math::ReluFunctor, + paddle::operators::math::ReluGradFunctor>( + ctx, paddle::operators::math::AddGradFunctor(), + paddle::operators::math::ReluFunctor(), + paddle::operators::math::ReluGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); + } else if (funcs_str == "relu_grad,elementwise_add_grad") { + RunUnaryCompoundGradFunctors, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, + ReComputation /*Recomputation*/>( + ctx, paddle::operators::math::ReluGradFunctor(), + paddle::operators::math::AddFunctor(), + paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); + } else if (funcs_str == "elementwise_mul_grad,scale_grad") { + // The backward of Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundGradFunctors, + paddle::operators::math::ScaleFunctor, + paddle::operators::math::ScaleGradFunctor>( + ctx, paddle::operators::math::MulGradFunctor(), + paddle::operators::math::ScaleFunctor(scale), + paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); + } else { + PADDLE_THROW("%s has not been implemented.", funcs_str); + } +} + +template +class FusedElemwiseActivationKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &in_x = detail::Ref(ctx.Input("X"), + "Cannot get input tensor %s, variable name = %s", + "X", ctx.op().Input("X")); + auto &in_y = detail::Ref(ctx.Input("Y"), + "Cannot get input tensor %s, variable name = %s", + "Y", ctx.op().Input("Y")); + PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty"); + auto output = ctx.Output("Out"); + + std::vector outputs; + outputs.emplace_back(output); + + if (ctx.Attr("keep_intermediate_value")) { + PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"), + "The keep_intermediate_value is enable, so the " + "IntermediateOut should not be empty."); + auto intermediate_out = ctx.Output("IntermediateOut"); + outputs.emplace_back(intermediate_out); + } else { + outputs.emplace_back(nullptr); + } + + RunFunctors(ctx, in_x, in_y, &outputs); + } +}; + +template +class FusedElemwiseActivationGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const 
override { + auto x = ctx.Input("X"); + auto y = ctx.Input("Y"); + + auto in_out = ctx.Input("Out"); + auto in_out_grad = + ctx.Input(framework::GradVarName("Out")); + + framework::Tensor *x_grad = + ctx.Output(framework::GradVarName("X")); + framework::Tensor *y_grad = + ctx.Output(framework::GradVarName("Y")); + + PADDLE_ENFORCE(y != nullptr, "Input(Y) should not be nullptr."); + + if (ctx.Attr("recomputation")) { + PADDLE_ENFORCE( + x != nullptr, + "The recomputation is opened, so Input(X) should not be absent."); + } else { + PADDLE_ENFORCE(in_out != nullptr, + "The recomputation is disabled, so the Input('Out') " + "should not be empty."); + } + + framework::Tensor *in_x; + auto functor_list = ctx.Attr>("functor_list"); + + // If functor_list contains elementwise_add, the backward doesn't use + // in_x, and in_outs. + if (x == nullptr) { + PADDLE_ENFORCE(functor_list[0] == "elementwise_add_grad" || + functor_list[1] == "elementwise_add_grad", + "Only when the compoundfunctor contains " + "elementwise_add_grad, the 'X' could be absent."); + in_x = const_cast(in_out_grad); + in_out = const_cast(in_out_grad); + } else { + in_x = const_cast(x); + } + + framework::Tensor *in_intermediate_out; + if (ctx.Attr("keep_intermediate_value")) { + in_intermediate_out = const_cast( + ctx.Input("IntermediateOut")); + PADDLE_ENFORCE(in_intermediate_out != nullptr, + "The option of 'keep_intermediate_value' is opened, " + "so the number of 'Out' should be two."); + } else { + in_intermediate_out = nullptr; + } + + if (ctx.Attr("recomputation")) { + RunGradFunctors( + ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, + y_grad); + } else { + RunGradFunctors( + ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, + y_grad); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..31e87d9113118ebe7a4b25ffee5ba55e2714fb66 --- /dev/null +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -0,0 +1,430 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fusion_gru_op.h" +#include // for memcpy +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Assert only one Input(WeightX) of GRU."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Assert only one Input(WeightH) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of GRU."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 3; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 3 * frame_size, + "The second dimension of Input(WeightH) " + "should be 3 * %d.", + frame_size); + + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + } + if (ctx->HasInput("Bias")) { + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ(b_dims[1], frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->ShareLoD("X", "Hidden"); + int xx_width; + if (ctx->Attrs().Get("use_seq")) { + xx_width = wx_dims[1]; + } else { + xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Assert only one Output(ReorderedH0) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Assert only one Output(BatchedInput) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), + "Assert only one Output(BatchedOut) of GRU."); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedOut", out_dims); + } + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionGRUOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionGRUOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. 
The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("H0", + "(Tensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") + .AsDispensable(); + AddInput("WeightX", + "(Tensor) The FC weight with shape (M x 3D)," + "where M is the dim size of x, D is the hidden size. "); + AddInput("WeightH", + "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. " + "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" + "Acutally they are D x 2D and D x D two part weights." + "{W_update, W_reset; W_state}" + "{D x (D + D); D x D}"); + AddInput("Bias", + "(Tensor, optional) (1 x 3D)." + "Almost same as GRUOp." + "Note: if have FC bias it should be added on this bias.") + .AsDispensable(); + AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.") + .AsIntermediate(); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 3D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedInput", + "(LoDTensor) This is the batched result of input X" + "or the batched result after fc, shape (T x 3D)") + .AsIntermediate(); + AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.") + .AsIntermediate(); + AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp"); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed GRU.") + .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute GRU.") + .SetDefault(true); + AddComment(R"DOC( +The Fusion complete GRU Operator. +This operator fuse the fully-connected operator into GRU, +more details can refer to GRU op. 
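+Schematically, one step of the fused kernel computes
+
+  u_t  = act_gate(x_u + h_{t-1} * W_update)
+  r_t  = act_gate(x_r + h_{t-1} * W_reset)
+  ~h_t = act_state(x_c + (r_t . h_{t-1}) * W_state)
+  h_t  = u_t . ~h_t + (1 - u_t) . h_{t-1}
+
+where "." denotes elementwise multiplication and x_u, x_r, x_c are the three
+D-wide slices of XX = X * WeightX plus the optional Bias.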
+)DOC"); +} + +template +class FusionGRUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + if (ctx.Attr("use_seq")) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } + +#define INIT_VEC_FUNC \ + std::function act_gate, act_state; \ + std::function cross; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_state_str = ctx.Attr("activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_state = act_functor(act_state_str); \ + cross = math::vec_cross; \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_state = act_functor(act_state_str); \ + cross = math::vec_cross; \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); + +#define INIT_BASE_SIZES \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 3D*/ \ + const int total_T = x_dims[0]; \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D3 = wh_dims[1]; \ + const int D2 = D * 2; + + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* x = ctx.Input("X"); + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : nullptr; + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + const T* wh_state_data = wh_data + D * D2; + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D3, M, x_data, wx_data, + xx_data, + bias ? bias->data() : nullptr); + + int xx_offset = D3; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 3; + hidden_out_data = hidden_out_data + offset; + xx_offset = -D3; + gate_offset = -D; + } + auto move_step = [&]() { + xx_data = xx_data + xx_offset; + hidden_out_data = hidden_out_data + gate_offset; + }; + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? 
N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_hidden_data = nullptr; + int tstart = 0; + if (h0_data) { + prev_hidden_data = h0_data + bid * D; + } else { + // W: {W_update, W_reset; W_state} + // update gate + act_gate(D, xx_data, xx_data); + // state gate + act_state(D, xx_data + D2, xx_data + D2); + // out = a*b + blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data); + // save prev + prev_hidden_data = hidden_out_data; + tstart = 1; + move_step(); + } + for (int step = tstart; step < seq_len; ++step) { + // gemm prev * (Wu + Wr) + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), + prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, + D3); + act_gate(D2, xx_data, xx_data); + // rt = rt*ht_1 inplace result + blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data); + + // gemm rt * Ws + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), + hidden_out_data, D, wh_state_data, D, static_cast(1), + xx_data + D2, D3); + act_state(D, xx_data + D2, xx_data + D2); + // out = zt*ht~ + (1-zt)*ht_1 + cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data); + // save prev + prev_hidden_data = hidden_out_data; + move_step(); + } + } + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* x = ctx.Input("X"); + if (x->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_out = ctx.Output("BatchedOut"); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_input_data = batched_input->mutable_data(ctx.GetPlace()); + T* batched_out_data = batched_out->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + math::LoDTensor2BatchFunctor to_batch; + if (M > D3) { + math::FCCompute(blas, total_T, D3, M, x_data, wx_data, + xx_data, + bias ? bias->data() : nullptr); + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_input->set_lod(xx->lod()); + math::FCCompute(blas, total_T, D3, M, xx_data, wx_data, + batched_input_data, + bias ? 
bias->data() : nullptr); + } + + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + + int tstart = 0; + T* prev_hidden_data = nullptr; + if (h0) { + // reorder h0 + T* reordered_h0_data = reordered_h0->mutable_data(ctx.GetPlace()); + const T* h0_data = h0->data(); + prev_hidden_data = reordered_h0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + } + } else { + // compute without h0 + T* cur_in_data = batched_input_data; + T* cur_out_data = batched_out_data; + // W: {W_update, W_reset; W_state} + for (int i = 0; i < max_bs; ++i) { + // update gate + act_gate(D, cur_in_data, cur_in_data); + // state gate + act_state(D, cur_in_data + D2, cur_in_data + D2); + // out = a*b + blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data); + // add offset + cur_in_data += D3; + cur_out_data += D; + } + tstart = 1; + prev_hidden_data = batched_out_data; + } + // Then start from next + const T* wh_state_data = wh_data + D * D2; + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + batched_input_data = batched_input_data + tstart * max_bs * D3; + batched_out_data = batched_out_data + tstart * max_bs * D; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + // gemm prev * (Wu + Wr) + blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast(1), + prev_hidden_data, D, wh_data, D2, static_cast(1), + batched_input_data, D3); + + T* cur_batched_data = batched_input_data; + T* cur_out_data = batched_out_data; + T* cur_prev_hidden_data = prev_hidden_data; + for (int i = 0; i < cur_bs; ++i) { + act_gate(D2, cur_batched_data, cur_batched_data); + // rt = rt*ht_1 inplace result + blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data); + + cur_batched_data += D3; + cur_prev_hidden_data += D; + cur_out_data += D; + } + + cur_batched_data = batched_input_data; + cur_out_data = batched_out_data; + blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast(1), + cur_out_data, D, wh_state_data, D, static_cast(1), + cur_batched_data + D2, D3); + + cur_prev_hidden_data = prev_hidden_data; + for (int i = 0; i < cur_bs; ++i) { + // ht~ = act_state(...) 
+ act_state(D, cur_batched_data + D2, cur_batched_data + D2); + // out = zt*ht~ + (1-zt)*ht_1 + cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data, + cur_out_data); + + cur_batched_data += D3; + cur_prev_hidden_data += D; + cur_out_data += D; + } + prev_hidden_data = batched_out_data; + batched_out_data = cur_out_data; + batched_input_data = cur_batched_data; + } + + math::Batch2LoDTensorFunctor to_seq; + batched_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_out, hidden_out); + } +#undef INIT_VEC_FUNC +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel, + ops::FusionGRUKernel); diff --git a/paddle/fluid/operators/fusion_gru_op.h b/paddle/fluid/operators/fusion_gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..eaa59cd412f8f2fd0089428f5e25202c70f032c7 --- /dev/null +++ b/paddle/fluid/operators/fusion_gru_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionGRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionGRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..55e465e3af08c012b8cff7714452ed32b32a5556 --- /dev/null +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -0,0 +1,588 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fusion_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Assert only one Input(WeightX) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of LSTM."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 4; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ( + b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection or" + "4 * %d if disable peepholes", + frame_size, frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); + int xx_width; + if (ctx->Attrs().Get("use_seq")) { + xx_width = wx_dims[1]; + } else { + xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Assert only one Output(BatchedInput) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Assert only one Output(BatchedHidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), + "Assert only one Output(BatchedCell) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Assert only one Output(ReorderedH0) of LSTM"); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), + "Assert only one Output(ReorderedC0) of LSTM."); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchedCell", out_dims); + } + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. 
" + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute.") + .SetDefault(true); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. 
+)DOC"); +} + +template +class FuisonLSTMKernel : public framework::OpKernel { + public: +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); + +#define INIT_BASE_SIZES \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + const int D4 = wh_dims[1]; + +#define INIT_BASE_INPUT_DATAS \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + Tensor checked_cell; \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + checked_cell_data = checked_cell.mutable_data({2, D}, place); \ + } + +/// Compute LSTM +#define GEMM_WH_ADDON(bs, prev, out) \ + blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ + wh_data, D4, static_cast(1), out, D4) + +// gates: W_ch, W_ih, W_fh, W_oh +#define GET_Ct(ct_1, gates, ct) \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, gates + D); \ + blas.VMUL(D, ct_1, gates + D2, gates + D2); \ + blas.VADD(D, gates + D, gates + D2, ct) + +#define GET_Ht(ct, gates, ht) \ + /* H_t = act_cell(C_t) * ogated */ \ + act_cell(D, ct, gates + D2); \ + blas.VMUL(D, gates + D2, gates + D3, ht) + +#define GET_Ct_NOH0C0(gates, ct) \ + /* C_t = igated * cgated*/ \ + act_gate(D, gates + D, gates + D); \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, ct) + +#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + /* get outgated, put W_oc * C_t on igated */ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ + act_gate(D3, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ + /* get fgated and igated*/ \ + blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ + blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ + blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ + 
act_gate(D2, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + /* get ogated*/ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + auto x_lod = x->lod(); + const int total_T = x_dims[0]; + const int N = x_lod[0].size() - 1; + const T* h0_data = h0 ? h0->data() : nullptr; + const T* c0_data = c0 ? c0->data() : nullptr; + T* xx_data = xx->mutable_data(place); + T* h_out_data = hidden_out->mutable_data(place); + T* c_out_data = cell_out->mutable_data(place); + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D4, M, x_data, wx_data, + xx_data, bias->data()); + + int xx_offset = D4; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 4; + h_out_data = h_out_data + offset; + c_out_data = c_out_data + offset; + xx_offset = -D4; + gate_offset = -D; + } + +#define MOVE_ONE_STEP \ + prev_h_data = h_out_data; \ + prev_c_data = c_out_data; \ + xx_data = xx_data + xx_offset; \ + h_out_data = h_out_data + gate_offset; \ + c_out_data = c_out_data + gate_offset + +#define PROCESS_H0C0_DEFINES \ + int bid = is_reverse ? N - 1 - i : i; \ + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \ + const T* prev_c_data = nullptr; \ + const T* prev_h_data = nullptr; \ + int tstart = 0 + +#define PROCESS_H0C0_PEEPHOLE \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + +#define PROCESS_H0C0 \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + + if (use_peepholes) { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0_PEEPHOLE + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } else { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0 + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } +#undef PROCESS_H0C0_DEFINES +#undef PROCESS_H0C0_PEEPHOLE +#undef PROCESS_H0C0 +#undef MOVE_ONE_STEP + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + if (x->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_c_out = ctx.Output("BatchedCell"); + auto* batched_h_out = ctx.Output("BatchedHidden"); + T* xx_data = xx->mutable_data(place); + T* batched_input_data = batched_input->mutable_data(place); + T* batched_c_out_data = batched_c_out->mutable_data(place); + T* batched_h_out_data = batched_h_out->mutable_data(place); + hidden_out->mutable_data(place); + 
cell_out->mutable_data(place); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + if (M > D4) { + math::FCCompute(blas, x_dims[0], D4, M, x_data, wx_data, + xx_data, bias->data()); + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_input->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], D4, M, xx_data, + wx_data, batched_input_data, + bias->data()); + } + + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + reordered_c0->Resize({max_bs, D}); + + int tstart = 0; + T* prev_h_data = nullptr; + T* prev_c_data = nullptr; + if (h0) { + // reorder h0, c0 + T* reordered_h0_data = reordered_h0->mutable_data(place); + T* reordered_c0_data = reordered_c0->mutable_data(place); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + prev_h_data = reordered_h0_data; + prev_c_data = reordered_c0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + reordered_c0_data += D; + } + } else { + // compute without h0, c0 + T* cur_in_data = batched_input_data; + T* cur_h_out_data = batched_h_out_data; + T* cur_c_out_data = batched_c_out_data; + for (int i = 0; i < max_bs; ++i) { + GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); + if (use_peepholes) { + blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); + blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); + } + act_gate(D, cur_in_data + D3, cur_in_data + D3); + GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + cur_in_data += D4; + cur_c_out_data += D; + cur_h_out_data += D; + } + tstart = 1; + prev_h_data = batched_h_out_data; + prev_c_data = batched_c_out_data; + } + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + const int offset = tstart * max_bs * D; + batched_input_data = batched_input_data + offset * 4; + batched_h_out_data = batched_h_out_data + offset; + batched_c_out_data = batched_c_out_data + offset; + +#define DEFINE_CUR \ + T* cur_in_data = batched_input_data; \ + T* cur_prev_c_data = prev_c_data; \ + T* cur_c_out_data = batched_c_out_data; \ + T* cur_h_out_data = batched_h_out_data + +#define MOVE_ONE_BATCH \ + cur_in_data += D4; \ + cur_prev_c_data += D; \ + cur_c_out_data += D; \ + cur_h_out_data += D + +#define MOVE_ONE_STEP \ + prev_c_data = batched_c_out_data; \ + prev_h_data = batched_h_out_data; \ + batched_c_out_data = cur_c_out_data; \ + batched_h_out_data = cur_h_out_data; \ + batched_input_data = cur_in_data + + if (use_peepholes) { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } else { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + 
MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } +#undef MOVE_ONE_STEP +#undef MOVE_ONE_BATCH +#undef DEFINE_CUR + + math::Batch2LoDTensorFunctor to_seq; + batched_h_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_h_out, hidden_out); + batched_c_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_c_out, cell_out); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + if (ctx.Attr("use_seq")) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } + +#undef COMPUTE_CtHt_PEEPHOLE +#undef COMPUTE_CtHt +#undef GET_Ct_NOH0C0 +#undef COMPUTE_CtHt_NOH0C0 +#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 +#undef GET_Ht +#undef GET_Ct +#undef GEMM_WH_ADDON +#undef INIT_BASE_INPUT_DATAS +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT +#undef INIT_VEC_FUNC +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel, + ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/fusion_lstm_op.h b/paddle/fluid/operators/fusion_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7f79601602348ac454fc6c0cefcba0643ad8e6e2 --- /dev/null +++ b/paddle/fluid/operators/fusion_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd3d3887cf5167c779a8b20442fdb458cd7eab4 --- /dev/null +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc @@ -0,0 +1,206 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusionSeqExpandConcatFCOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_GT( + ctx->Inputs("X").size(), 1UL, + "Inputs(X) of FusionSeqExpandConcatFCOp should be larger than 1."); + PADDLE_ENFORCE( + ctx->HasInput("FCWeight"), + "Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FusionSeqExpandConcatFCOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("FCOut"), + "Output(FCOut) of FusionSeqExpandConcatFCOp should not be null."); + + auto ins_dims = ctx->GetInputsDim("X"); + auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D + PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2."); + const int D = w_dims[1]; + int sum = ins_dims[0][1]; + for (size_t i = 1; i < ins_dims.size(); ++i) { + sum += ins_dims[i][1]; + } + PADDLE_ENFORCE_EQ(sum, w_dims[0], + "FC height should be the sum of all inputs' widths."); + if (ctx->HasInput("FCBias")) { + auto b_dims = ctx->GetInputDim("FCBias"); + PADDLE_ENFORCE(b_dims.size() == 1 || b_dims.size() == 2, + "b_dims rank should be 1 or 2, got %d", b_dims.size()); + if (b_dims.size() == 1) { + PADDLE_ENFORCE_EQ(b_dims[0], D, "FCBias shapes must be %d.", D); + } else { + PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1x%d.", D); + PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1x%d.", D); + } + } + + ctx->SetOutputDim("Out", {ins_dims[0][0], D}); + // FCOut should be reshaped at runtime, since the lod is not available in InferShape. + // Explicitly share the ref lod. + ctx->ShareLoD("X", "Out", 0); +} + +framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); +} + +void FusionSeqExpandConcatFCOpMaker::Make() { + AddInput("X", + "(LoDTensor) input LoDTensors; the first one must have the ref lod " + "for sequence expand, and the rest inputs should have the same lod.") + .AsDuplicable(); + AddInput("FCWeight", "(Tensor) the weights of fc."); + AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable(); + AddOutput("Out", "(LoDTensor) Output LoDTensor."); + AddOutput( + "FCOut", + "(Tensor) the intermediate tensor to keep the result of fc. " + "Shape is (N x D), where N is the batch size, D is the output dim of fc.") + .AsIntermediate(); + AddAttr("fc_activation", + "(string, default: identity) " + "The activation for the result of fc, " + "`identity` by default.") + .SetDefault("identity") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Sequence expand + concat + fc Operator. + +All conditions below should be met: + +The ref_level of seq_expand should be 0. + +The ref lod for seq_expand is taken from the first input of concat. + +The other inputs should have the same lod and the same batch size as the ref lod. + +The seq len of the other inputs should be 1. + +The concat axis should be 1.
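+
+A purely illustrative example (the shapes and lod below are hypothetical, not
+taken from any test): suppose X holds two inputs, X[0] of shape (5 x M0) with
+lod [[0, 2, 5]] (two sequences of lengths 2 and 3), and X[1] of shape (2 x M1)
+with one time step per sequence. FCWeight then has shape ((M0 + M1) x D), and
+Out is a (5 x D) LoDTensor sharing X[0]'s lod: conceptually, X[1]'s row for
+sequence i is expanded to that sequence's length, concatenated with X[0] along
+axis 1, and passed through the fc.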
+ +)DOC"); +} + +template +class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto ins = ctx.MultiInput("X"); + auto* w = ctx.Input("FCWeight"); + auto* b = ctx.Input("FCBias"); + auto* out = ctx.Output("Out"); + auto* fc_out = ctx.Output("FCOut"); + + auto* ref_in = ins[0]; + auto ref_lod = ref_in->lod(); + auto in1_lod = ins[1]->lod(); + auto ref_dims = ref_in->dims(); // T x M0 + auto in1_dims = ins[1]->dims(); // N x M1 + auto w_dims = w->dims(); + const int N = ref_lod[0].size() - 1; + const int total_T = ref_dims[0]; + const int M0 = ref_dims[1]; + const int M1 = in1_dims[1]; + const int D = w_dims[1]; + + // some check and fcout should be reshape here + // since infershape can not get lod info + PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1."); + PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1."); + PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N, + "Batch size of all inputs should be equal."); + PADDLE_ENFORCE_EQ(in1_lod[0][N], N, + "Seq_length of other inputs should be 1."); + PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size."); + for (size_t i = 2; i < ins.size(); ++i) { + PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N, + "All other inputs height should be equal"); + PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod, + "All other inputs should have same lod"); + } + fc_out->Resize({N, D}); + + std::function fc_act; + auto& fc_act_str = ctx.Attr("fc_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + fc_act = act_functor(fc_act_str); + } else { + math::VecActivations act_functor; + fc_act = act_functor(fc_act_str); + } + + const T* ref_in_data = ref_in->data(); + const T* in1_data = ins[1]->data(); + const T* w_data = w->data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D, M0, ref_in_data, w_data, + out_data, b ? 
b->data() : NULL); + w_data = w_data + M0 * D; + // first write on + blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data); + w_data = w_data + M1 * D; + for (size_t i = 2; i < ins.size(); ++i) { + // add on + const T* in_data = ins[i]->data(); + const int K = ins[i]->dims()[1]; + blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast(1), in_data, + K, w_data, D, static_cast(1), fc_out_data, D); + w_data = w_data + K * D; + } + T* cur_out_data = out_data; + for (int i = 0; i < N; ++i) { + int seq_len = ref_lod[0][i + 1] - ref_lod[0][i]; + T* src = fc_out_data + i * D; + for (int step = 0; step < seq_len; ++step) { + blas.VADD(D, cur_out_data, src, cur_out_data); + cur_out_data = cur_out_data + D; + } + } + fc_act(total_T * D, out_data, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp, + ops::FusionSeqExpandConcatFCOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc, + ops::FusionSeqExpandConcatFCOpKernel, + ops::FusionSeqExpandConcatFCOpKernel); diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f78e820f603354944bd7fc23aff2d1d72e5ba750 --- /dev/null +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionSeqExpandConcatFCOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index aa3e05b83b23569a4dd9c83294916e289f993abc..089b541a0a61adb5efda6b2e027c913d5808dff0 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -101,5 +101,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); -REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel); +REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, + ops::GatherOpKernel, ops::GatherOpKernel); +REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 5c746878823b3dcde2573feec00d3d9dac5ceab8..087f903a8bba9a4bfcd7eaabd7098555442a904e 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -14,6 +14,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gru_op.h" #include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" + +DECLARE_int32(paddle_num_threads); namespace paddle { namespace operators { @@ -211,6 +216,158 @@ class GRUGradOp : public framework::OperatorWithKernel { } }; +template +class GRUCPUKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* input = context.Input("Input"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* bias = context.Input("Bias"); + auto* batch_gate = context.Output("BatchGate"); + batch_gate->mutable_data(context.GetPlace()); + auto* batch_reset_hidden_prev = + context.Output("BatchResetHiddenPrev"); + batch_reset_hidden_prev->mutable_data(context.GetPlace()); + auto* batch_hidden = context.Output("BatchHidden"); + batch_hidden->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + auto hidden_dims = hidden->dims(); + + bool is_reverse = context.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); + to_batch(dev_ctx, *input, batch_gate, true, is_reverse); + + if (bias) { + math::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate, *bias, batch_gate); + } + + int frame_size = hidden_dims[1]; + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. 
+ ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t seq_len = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + +#ifdef PADDLE_WITH_MKLML + // use MKL packed to speedup GEMM + if (FLAGS_paddle_num_threads >= 4) { + auto blas = math::GetBlas(dev_ctx); + T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size * 2 /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_gate); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2, + frame_size, T(1.0), gru_value.gate_weight, frame_size * 2, + packed_gate); + T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_state); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size, + frame_size, T(1.0), gru_value.state_weight, frame_size, + packed_state); + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2, + frame_size, gru_value.prev_out_value, frame_size, packed_gate, + frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); + } + + math::detail::forward_reset_output( + math::detail::forward::gru_resetOutput(), gru_value, frame_size, + cur_batch_size, active_gate); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size, + gru_value.reset_output_value, frame_size, packed_state, + frame_size, T(1), gru_value.gate_value + frame_size * 2, + frame_size * 3); + } + + math::detail::forward_final_output( + math::detail::forward::gru_finalOutput(), gru_value, frame_size, + cur_batch_size, active_node); + + gru_value.prev_out_value = gru_value.output_value; + } + + blas.GEMM_FREE(packed_gate); + blas.GEMM_FREE(packed_state); + } else { +#endif + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + + gru_value.prev_out_value = gru_value.output_value; + } +#ifdef PADDLE_WITH_MKLML + } +#endif + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(dev_ctx, *batch_hidden, hidden); + } + + void Compute(const 
framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + } // namespace operators } // namespace paddle @@ -218,9 +375,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(gru_grad, ops::GRUGradOp); -REGISTER_OP_CPU_KERNEL( - gru, ops::GRUKernel, - ops::GRUKernel); +REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel, + ops::GRUCPUKernel); REGISTER_OP_CPU_KERNEL( gru_grad, ops::GRUGradKernel, ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index baf455a840314d1ab94eb8e0a2e5c660ba4202da..55721c283dd18c2f9642563a9ce1eabfce16fd7b 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -14,6 +14,96 @@ limitations under the License. */ #include "paddle/fluid/operators/gru_op.h" +namespace paddle { +namespace operators { + +template +class GRUKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("Input"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* bias = context.Input("Bias"); + auto* batch_gate = context.Output("BatchGate"); + batch_gate->mutable_data(context.GetPlace()); + auto* batch_reset_hidden_prev = + context.Output("BatchResetHiddenPrev"); + batch_reset_hidden_prev->mutable_data(context.GetPlace()); + auto* batch_hidden = context.Output("BatchHidden"); + batch_hidden->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + auto hidden_dims = hidden->dims(); + + bool is_reverse = context.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); + to_batch(dev_ctx, *input, batch_gate, true, is_reverse); + + if (bias) { + math::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate, *bias, batch_gate); + } + + int frame_size = hidden_dims[1]; + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. 
+ ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + gru_value.prev_out_value = gru_value.output_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(dev_ctx, *batch_hidden, hidden); + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( gru, ops::GRUKernel, diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 3b0d93e54b72910de1429ddf41eb6b0fe9646942..0b551e8046be16c95f7d6b10b68b32a9af594f73 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -37,90 +37,6 @@ inline void ReorderInitState(const DeviceContext& ctx, row_shuffle(ctx, src, index_lod, dst, indexed_src); } -template -class GRUKernel : public framework::OpKernel { - public: - void BatchCompute(const framework::ExecutionContext& context) const { - auto* input = context.Input("Input"); - auto* h0 = context.Input("H0"); - auto* weight = context.Input("Weight"); - const T* weight_data = weight->data(); - auto* bias = context.Input("Bias"); - auto* batch_gate = context.Output("BatchGate"); - batch_gate->mutable_data(context.GetPlace()); - auto* batch_reset_hidden_prev = - context.Output("BatchResetHiddenPrev"); - batch_reset_hidden_prev->mutable_data(context.GetPlace()); - auto* batch_hidden = context.Output("BatchHidden"); - batch_hidden->mutable_data(context.GetPlace()); - auto* hidden = context.Output("Hidden"); - hidden->mutable_data(context.GetPlace()); - - auto hidden_dims = hidden->dims(); - - bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = context.template device_context(); - to_batch(dev_ctx, *input, batch_gate, true, is_reverse); - - if (bias) { - math::RowwiseAdd add_bias; - add_bias(dev_ctx, *batch_gate, *bias, batch_gate); - } - - int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; - gru_value.gate_weight = const_cast(weight_data); - gru_value.state_weight = - const_cast(weight_data + 2 * frame_size * frame_size); - Tensor ordered_h0; - - framework::Vector order(batch_gate->lod()[2]); - - if (h0) { - // Since the batch computing for GRU reorders the input sequences - // according to their length. The initialized cell state also needs - // to reorder. 
- ReorderInitState( - context.template device_context(), *h0, order, - &ordered_h0, true); - gru_value.prev_out_value = ordered_h0.data(); - } else { - gru_value.prev_out_value = nullptr; - } - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( - context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( - context.Attr("gate_activation")); - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - int cur_batch_size = bend - bstart; - - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.data(); - gru_value.gate_value = gate_t.data(); - gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( - dev_ctx, gru_value, frame_size, cur_batch_size, active_node, - active_gate); - gru_value.prev_out_value = gru_value.output_value; - } - - math::Batch2LoDTensorFunctor to_seq; - batch_hidden->set_lod(batch_gate->lod()); - to_seq(dev_ctx, *batch_hidden, hidden); - } - - void Compute(const framework::ExecutionContext& context) const override { - BatchCompute(context); - } -}; - template class GRUGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 2d9faed648aef78da60706e13db3862080c96514..451ec61ba1f7239d92c6dfbad0b2961e74e1bc17 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel { gate_data, frame_size * 3); // calculate activited gate - Eigen::array extents({{batch_size, frame_size}}); - Eigen::array u_offsets({{0, 0}}); + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; ActCompute(context.Attr("gate_activation"), place, g.slice(u_offsets, extents), g.slice(u_offsets, extents)); auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets({{0, frame_size}}); + Eigen::array r_offsets{{0, frame_size}}; ActCompute(context.Attr("gate_activation"), place, g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate @@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel { weight_data + frame_size * frame_size * 2, frame_size, 1, gate_data + frame_size * 2, frame_size * 3); - Eigen::array c_offsets({{0, frame_size * 2}}); + Eigen::array c_offsets{{0, frame_size * 2}}; ActCompute(context.Attr("activation"), place, g.slice(c_offsets, extents), g.slice(c_offsets, extents)); auto c = g.slice(c_offsets, extents); // output candidate @@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel { int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; - Eigen::array extents({{batch_size, frame_size}}); - Eigen::array u_offsets({{0, 0}}); + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets({{0, frame_size}}); + Eigen::array r_offsets{{0, frame_size}}; auto r = g.slice(r_offsets, extents); // reset gate - Eigen::array c_offsets({{0, frame_size * 2}}); + Eigen::array c_offsets{{0, frame_size * 2}}; auto c = g.slice(c_offsets, extents); // output candidate // backward for 
unactivated update gate diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dadd054b9a6f8d44f4e5832888052bffde34c827 --- /dev/null +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" +#include + +namespace paddle { +namespace operators { + +/** + * Organize the classes into a binary tree. At each node, a sigmoid function + * is used to calculate the probability of belonging to the right branch. + * This idea is from "F. Morin, Y. Bengio (AISTATS 05): + * Hierarchical Probabilistic Neural Network Language Model." + * + * Here we use a simple way of building the binary tree. + * Assuming the number of classes is C = 6, + * the classes are organized as a binary tree in the following way: + * + * @code{.py} + * *-*-*- 2 + * | | |- 3 + * | | + * | |-*- 4 + * | |- 5 + * | + * |-*- 0 + * |- 1 + * @endcode + * + * where * indicates an internal node, and each leaf node represents a class. + * - Node 0 ... C-2 are internal nodes. + * - Node C-1 ... 2C-2 are leaf nodes. + * - Class c is represented by leaf node \f$c+C-1\f$. + * + * We assign an id to each node: + * - the id of the root is 0. + * - the left child of a node i is 2*i+1. + * - the right child of a node i is 2*i+2. + * + * It's easy to see that: + * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$. + * - the j-th level ancestor of node i is + * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$. + * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
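+ *
+ * For example (derived from the rules above, with C = 6): class 2 is stored
+ * at leaf node 2 + C - 1 = 7. Its parent chain is 7 -> 3 -> 1 -> 0, and since
+ * (7-1)%2 == (3-1)%2 == (1-1)%2 == 0, the leaf is reached from the root by
+ * taking the left branch three times, which matches the picture above.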
+ * + */ + +class HierarchicalSigmoidOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("PreOut"), + "Output(PreOut) should not be null."); + const int64_t batch_size = ctx->GetInputDim("X")[0]; + std::vector output_shape({batch_size, 1}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace()); + } +}; + +template +class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, required) The input tensor with shape [N, D], " + "where N is the size of mini-batch, and D is the feature size."); + AddInput("W", + "(Tensor, required), The parameters of hierarchical " + "sigmoid operator, each of them is a 2-D tensor, the shape is " + "[num_classes - 1, D]."); + AddInput("Label", + "(Tensor, required), The labels of training data. It's a " + "tensor with shape [N, 1]."); + AddInput("Bias", + "(Tensor, optional), The bias is a tensor with shape " + "[1, num_classes - 1]."); + AddOutput("Out", + "(Tensor, required) The output of hierarchical sigmoid operator. " + "The shape is [N, 1]."); + AddOutput("PreOut", + "(Tensor, required) An intermediate 2-D tensor with shape " + "[batch_size, code_length], where code_length represents the " + "maximum path length from root to leaf nodes.") + .AsIntermediate(); + AddAttr("num_classes", "(int, required), The number of classes") + .SetDefault(2); + AddComment(R"DOC( +The hierarchical sigmoid operator organizes the classes into a binary tree. +At each node, a sigmoid function is used to calculate the probability of +belonging to the right branch. This idea is from +"F. Morin, Y. Bengio (AISTATS 05): +Hierarchical Probabilistic Neural Network Language Model."
+ )DOC"); + } +}; + +class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PreOut"), + "Input(Preout) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")), + "Output(W@Grad should not be null.)"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp); +REGISTER_OP_CPU_KERNEL( + hierarchical_sigmoid, + ops::HierarchicalSigmoidOpKernel, + ops::HierarchicalSigmoidOpKernel); +REGISTER_OP_CPU_KERNEL( + hierarchical_sigmoid_grad, + ops::HierarchicalSigmoidGradOpKernel, + ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..64096a717b12ed231344649f5eb76b7e4b9af4a6 --- /dev/null +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/fluid/platform/transform.h" +namespace paddle { +namespace operators { + +template +using EigenMatrix = framework::EigenMatrix; +using platform::Transform; + +template +class HierarchicalSigmoidOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* label = ctx.Input("Label"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + auto* pre_out = ctx.Output("PreOut"); + size_t num_classes = static_cast(ctx.Attr("num_classes")); + int64_t code_length = math::FindLastSet(num_classes - 1); + int64_t batch_size = in->dims()[0]; + framework::Tensor sum; + auto& dev_ctx = ctx.template device_context(); + auto* pre_out_data = pre_out->mutable_data( + framework::make_ddim({batch_size, code_length}), ctx.GetPlace()); + auto pre_out_mat = EigenMatrix::From(*pre_out); + // Not all class(leaf) nodes' path lengths equal code_length, thus init as + // 0s can avoid out of path's loss. + math::SetConstant zero; + zero(dev_ctx, pre_out, static_cast(0.0)); + auto& place = *ctx.template device_context().eigen_device(); + math::RowwiseSum row_sum; + math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + std::vector sum_dims({batch_size, 1UL}); + sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); + auto sum_mat = EigenMatrix::From(sum); + out->mutable_data(ctx.GetPlace()); + auto out_mat = framework::EigenVector::Flatten(*out); + if (bias) { + bit_code.Add(pre_out, *bias); + } + bit_code.Mul(pre_out, *w, *in); + // clip to [-40, 40] + Transform trans; + trans(ctx.template device_context(), pre_out_data, + pre_out_data + pre_out->numel(), pre_out_data, + ClipFunctor(static_cast(-40.0), static_cast(40.0))); + bit_code.Sum(*pre_out, out, static_cast(-1)); + // use softrelu to calculate cross entropy + pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); + row_sum(dev_ctx, *pre_out, &sum); + // TODO(guosheng): Subtract the out of path's loss, since not all + // class(leaf) nodes' path lengths equal code_length. But it won't break the + // gradient check since both have the out of path's loss and will cancel out + // each other. 
+ out_mat.device(place) = sum_mat + out_mat; + } +}; + +template +class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + auto* w_grad = ctx.Output(framework::GradVarName("W")); + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + auto* label = ctx.Input("Label"); + auto* pre_out = ctx.Input("PreOut"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + framework::Tensor pre_out_grad; + + pre_out_grad.mutable_data(pre_out->dims(), ctx.GetPlace()); + in_grad->mutable_data(ctx.GetPlace()); + w_grad->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, w_grad, static_cast(0.0)); + + size_t num_classes = static_cast(ctx.Attr("num_classes")); + math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + auto& place = *ctx.template device_context().eigen_device(); + auto pre_out_mat = EigenMatrix::From(*pre_out); + auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); + auto out_grad_mat = EigenMatrix::From(*out_grad); + Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); + + // softrelu derivative + pre_out_grad_mat.device(place) = + static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); + bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b) + pre_out_grad_mat.device(place) = + pre_out_grad_mat * out_grad_mat.broadcast(bcast); + // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to + // be consistent with the clipping in forward. + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code.AddGrad(pre_out_grad, bias_grad); + } + bit_code.MulGradWeight(pre_out_grad, w_grad, *in); + bit_code.MulGradError(pre_out_grad, *w, in_grad); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 0669661d225c664010fce97f0a526b62988b92c5..8efd43928aac994c7630a213f6724e8f50abc7e0 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/im2sequence_op.h" +#include #include namespace paddle { @@ -28,27 +29,18 @@ class Im2SequenceOp : public framework::OperatorWithKernel { "Input(X) of Im2SequenceOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of Im2SequenceOp op should not be null."); - auto in_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input(X) format must be 4D tensor, eg., NCHW."); + int img_channels = in_dim[1]; auto kernels = ctx->Attrs().Get>("kernels"); auto strides = ctx->Attrs().Get>("strides"); auto paddings = ctx->Attrs().Get>("paddings"); - int batch_size = in_dim[0]; - int img_channels = in_dim[1]; - int img_height = in_dim[2]; - int img_width = in_dim[3]; - - int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], - paddings[2], strides[0]); - int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], - paddings[3], strides[1]); - - ctx->SetOutputDim("Out", {batch_size * output_height * output_width, - img_channels * kernels[0] * kernels[1]}); + ctx->SetOutputDim("Out", + {in_dim[0], img_channels * kernels[0] * kernels[1]}); } }; @@ -61,6 +53,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "C: channels" "H: height" "W: width"); + AddInput("Y", + "(Tensor) The input tensor of image real size(H, W)." + "2-D with shape [batchsize, 2]") + .AsDispensable(); AddOutput("Out", "(LodTensor) The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " @@ -73,6 +69,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "(vector default:{0, 0, 0, 0}), the " "paddings(up_pad, left_pad, down_pad, right_pad)") .SetDefault({0, 0, 0, 0}); + AddAttr>("out_stride", + "the attribute is valid only when input(Y)" + "is not NULL.this attribute represents the" + "scaling of the pic through the CNN" + "(vector dedault:{1,1}),the out_stride" + " (out_stride_height, out_stride_width)") + .SetDefault({1, 1}); AddComment(R"DOC( This op uses kernels to scan images and converts these images to sequences. After expanding, The number of time steps are output_height * output_width @@ -123,7 +126,7 @@ output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] [ 7. 1. 7. 9. 2. 1. 3. 5.] [ 5. 7. 2. 4. 1. 3. 9. 0.] [ 7. 9. 4. 8. 3. 5. 0. 8.]] -output.dims = {8, 9} +output.dims = {8, 8} output.lod = [[0, 4, 8]] )DOC"); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index d792c68f784d8ffec0eb303a6ab9b59c9f121fa7..4a9942819414d552eb69bd0b30b66aab76a2dbf4 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" @@ -39,50 +40,107 @@ class Im2SequenceKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - // TODO(wanghaoshuang): Add layout checker after 'set_layout' - // being available for python API - // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, - // "Input(X) layout must be NCHW"); auto in_dim = in->dims(); int batch_size = in_dim[0]; int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx.Attr>("kernels"); auto strides = ctx.Attr>("strides"); auto paddings = ctx.Attr>("paddings"); - int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], - paddings[2], strides[0]); - int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], - paddings[3], strides[1]); - - const std::vector dilations({1, 1}); - - auto out_dims = out->dims(); - out->Resize({batch_size, out->numel() / batch_size}); - for (int i = 0; i < batch_size; i++) { - const Tensor src = - in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( - {output_height, output_width, img_channels, kernels[0], kernels[1]}); - - math::Im2ColFunctor f; - auto& dev_ctx = ctx.template device_context(); - f(dev_ctx, src, dilations, strides, paddings, &dst); - } - out->Resize(out_dims); - - // set lod information - // TODO(wanghaoshuang): Move this to InferShape - framework::LoD lod(1); - lod[0].reserve(batch_size + 1); - for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + if (ctx.HasInput("Y") && batch_size > 1) { + const Tensor* imgrealsize = ctx.Input("Y"); + auto out_stride = ctx.Attr>("out_stride"); + Tensor cpu_shape_tensor; + TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); + std::vector imgreal_h; + std::vector imgreal_w; + std::vector output_height; + std::vector output_width; + int result = 0; + for (int i = 0; i < batch_size; i++) { + int tmp_real_h = static_cast((cpu_shape_tensor.data())[2 * i]); + int tmp_real_w = + static_cast((cpu_shape_tensor.data())[2 * i + 1]); + if (tmp_real_h % out_stride[0] == 0) { + tmp_real_h = tmp_real_h / out_stride[0]; + } else { + tmp_real_h = tmp_real_h / out_stride[0] + 1; + } + if (tmp_real_w % out_stride[1] == 0) { + tmp_real_w = tmp_real_w / out_stride[1]; + } else { + tmp_real_w = tmp_real_w / out_stride[1] + 1; + } + imgreal_h.push_back(tmp_real_h); + imgreal_w.push_back(tmp_real_w); + output_height.push_back(Im2SeqOutputSize( + imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0])); + output_width.push_back(Im2SeqOutputSize( + imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1])); + result += output_height[i] * output_width[i]; + } + + out->mutable_data({result, img_channels * kernels[0] * kernels[1]}, + ctx.GetPlace()); + + const std::vector dilations({1, 1}); + int offset_out = 0; + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(offset_out, + offset_out + output_height[i] * output_width[i]) + .Resize({output_height[i], output_width[i], + img_channels, kernels[0], kernels[1]}); + offset_out += output_height[i] * output_width[i]; + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, 
src, dilations, strides, paddings, &dst); + } + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + int offset = 0; + lod[0].push_back(offset); + for (int i = 0; i < batch_size; ++i) { + offset += output_height[i] * output_width[i]; + lod[0].push_back(offset); + } + out->set_lod(lod); + } else { + int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], + paddings[3], strides[1]); + out->mutable_data({batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}, + ctx.GetPlace()); + const std::vector dilations({1, 1}); + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = + out->Slice(i, i + 1).Resize({output_height, output_width, + img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + int offset = 0; lod[0].push_back(offset); - offset += output_height * output_width; + for (int i = 0; i < batch_size; ++i) { + offset += output_height * output_width; + lod[0].push_back(offset); + } + out->set_lod(lod); } - out->set_lod(lod); } }; diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h index f56fd95e96526c59e040fbbd2812360e59570a08..f3da17de011053fa118b5a4257bb5c3b00084741 100644 --- a/paddle/fluid/operators/label_smooth_op.h +++ b/paddle/fluid/operators/label_smooth_op.h @@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel { auto dist = framework::EigenVector::Flatten(*dist_t); out.device(dev) = static_cast(1 - epsilon) * in + - epsilon * dist.broadcast(Eigen::DSizes(in_t->numel())); + static_cast(epsilon) * + dist.broadcast(Eigen::DSizes(in_t->numel())); } else { out.device(dev) = static_cast(1 - epsilon) * in + static_cast(epsilon / label_dim); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 6840e1e08f3d5bc84a05f15e30982c7cfb59680b..22343d7724b2f0dc01bff8c2274e3dd914bf70ef 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/layer_norm_op.h" +namespace paddle { +namespace operators { + +inline static int GetDesiredBlockDim(int block_dim) { + const int kMaxBlockDim = 512; + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << (static_cast(std::log2f(block_dim)))); +} + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) 
\ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__) + +static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); } +static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); } + +template +struct PairForLayerNorm { + __device__ __forceinline__ PairForLayerNorm() {} + __device__ __forceinline__ PairForLayerNorm(const T &first, const T &second) + : first_(first), second_(second) {} + + T first_; + T second_; +}; + +template +struct PairForLayerNormAddFunctor { + __device__ __forceinline__ PairForLayerNorm operator()( + const PairForLayerNorm &p1, const PairForLayerNorm &p2) { + return PairForLayerNorm(p1.first_ + p2.first_, p1.second_ + p2.second_); + } +}; + +template +__global__ void LayerNormForward(const T *x, const T *scale, const T *bias, + T *y, T *mean, T *var, float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + // Step 1: Reduce to calculate mean and var + double mean_val = 0; + double var_val = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T tmp = x[i]; + mean_val += tmp; + var_val += (tmp * tmp); + } + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(mean_val, var_val), + PairForLayerNormAddFunctor()); + if (threadIdx.x == 0) { + auto tmp = pair.first_ / feature_size; + mean[blockIdx.x] = static_cast(tmp); + var[blockIdx.x] = static_cast(pair.second_ / feature_size - tmp * tmp); + } + __syncthreads(); + mean_val = mean[blockIdx.x]; + var_val = static_cast(real_sqrt(var[blockIdx.x] + epsilon)); + + // Step 2: Calculate y + if (scale != nullptr) { + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j]; + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = scale[j] * (x[i] - mean_val) / var_val; + } + } + } else { // scale == nullptr + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = (x[i] - mean_val) / var_val + bias[j]; + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = (x[i] - mean_val) / var_val; + } + } + } +} + +// Make sure that d_scale != nullptr && d_bias != nullptr +// Since d_scale != nullptr, scale would not be nullptr +template +__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, + T *d_scale, T *d_bias, T *d_x, + const T *mean, const T *var, + const T *scale, float epsilon, + int batch_size, int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int beg_idx = threadIdx.x * feature_size + blockIdx.x; + int end_idx = batch_size * feature_size + blockIdx.x; + int stride = BlockDim * feature_size; + + T d_scale_partial = 0, d_bias_partial = 0; + + for (int i = beg_idx; i < end_idx; i += stride) { + int row_idx = i / 
feature_size; + auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + d_bias_partial += d_y[i]; + if (HasDx) { + d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; + } + } + + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_scale_partial, d_bias_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_scale[blockIdx.x] = pair.first_; + d_bias[blockIdx.x] = pair.second_; + } +} + +// Make sure that there is only one true expression: d_scale != nullptr +// or d_bias != nullptr +// Notice: scale may be nullptr +template +__global__ void LayerNormBackwardGradientScaleOrBias( + const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean, + const T *var, const T *scale, float epsilon, int batch_size, + int feature_size) { + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int beg_idx = threadIdx.x * feature_size + blockIdx.x; + int end_idx = batch_size * feature_size + blockIdx.x; + int stride = BlockDim * feature_size; + T d_scale_or_d_bias_partial = 0; + + for (int i = beg_idx; i < end_idx; i += stride) { + int row_idx = i / feature_size; + auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + if (HasDScale) { + d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + } else { // d_bias != nullptr + d_scale_or_d_bias_partial += d_y[i]; + } + + if (HasDx) { + if (scale != nullptr) { + d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; + } else { + d_x[i] = d_y[i] / var_val; + } + } + } + + d_scale_or_d_bias_partial = + BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum()); + + if (threadIdx.x == 0) { + if (HasDScale) { + d_scale[blockIdx.x] = d_scale_or_d_bias_partial; + } else { + d_bias[blockIdx.x] = d_scale_or_d_bias_partial; + } + } +} + +template +__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x, + const T *mean, + const T *var, + float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T d_x_reduce_tmp[2]; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + T block_mean = mean[blockIdx.x]; + T block_var = var[blockIdx.x]; + T d_x_mean_partial = 0, d_x_var_partial = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x_mean_partial += d_x[i]; + d_x_var_partial += d_x[i] * (x[i] - block_mean); + } + + auto pair = + BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_x_reduce_tmp[0] = pair.first_ / feature_size; + d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + } + __syncthreads(); + + d_x_mean_partial = d_x_reduce_tmp[0]; + d_x_var_partial = d_x_reduce_tmp[1]; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x[i] -= d_x_mean_partial; + d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + } +} + +// Here, we only calculate d_x +template +__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, + T *d_x, const T *mean, + const T *var, const T *scale, + float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T d_x_reduce_tmp[2]; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * 
feature_size; + + T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x]; + T d_x_mean_partial = 0, d_x_var_partial = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + auto var_val = static_cast(real_sqrt(block_var + epsilon)); + if (scale != nullptr) { + int col_idx = i % feature_size; + d_x[i] = d_y[i] * scale[col_idx] / var_val; + } else { + d_x[i] = d_y[i] / var_val; + } + d_x_mean_partial += d_x[i]; + d_x_var_partial += d_x[i] * (x[i] - block_mean); + } + + auto pair = + BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_x_reduce_tmp[0] = pair.first_ / feature_size; + d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + } + __syncthreads(); + + d_x_mean_partial = d_x_reduce_tmp[0]; + d_x_var_partial = d_x_reduce_tmp[1]; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x[i] -= d_x_mean_partial; + d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + } +} + +template +__global__ void LayerNormBackwardWhenBatchSizeIsOne( + const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean, + const T *var, const T *scale, float epsilon, int feature_size) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < feature_size) { + auto var_val = static_cast(real_sqrt(var[idx] + epsilon)); + if (d_x != nullptr) { + if (d_scale == nullptr) { + d_x[idx] = d_y[idx] / var_val; + } else { + d_x[idx] = d_y[idx] * scale[idx] / var_val; + } + } + + if (d_scale != nullptr) { + d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val; + } + + if (d_bias != nullptr) d_bias[idx] = d_y[idx]; + } +} + +template +static void LayerNormBackward(const T *x, const T *d_y, const T *scale, + const T *mean, const T *var, T *d_x, T *d_scale, + T *d_bias, float epsilon, int batch_size, + int feature_size, cudaStream_t stream) { + const int kMaxBlockDim = 512; + int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | + ((d_scale != nullptr ? 1 : 0) << 1) | + ((d_bias != nullptr ? 
1 : 0)); + if (gradient_flag == 0) return; + + if (batch_size == 1) { + LayerNormBackwardWhenBatchSizeIsOne< + T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, + stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, + feature_size); + + if (d_x != nullptr) { + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<<1, kBlockDim, 0, stream>>>( + x, d_x, mean, var, epsilon, feature_size)); + } + } + return; + } + + auto block_dim = GetDesiredBlockDim(batch_size); + switch (gradient_flag) { + case 1: // d_x == nulptr, d_scale == nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, false, + false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + break; + case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, false, + true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + break; + case 3: // d_x == nullptr, d_scale != nulptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientAll< + T, kBlockDim, false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, + batch_size, feature_size)); + } + break; + case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientOnlyDX< + T, kBlockDim><<>>( + x, d_y, d_x, mean, var, scale, epsilon, feature_size)); + } + break; + case 5: // d_x != nulptr, d_scale == nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, true, + false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, true, + true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientAll< + T, kBlockDim, true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, + batch_size, feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + default: + break; + } +} + +template +class LayerNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto *x = ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + 
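As a cross-check for LayerNormForward above: each CUDA block handles one row, accumulating E[x] and E[x^2] in a single block reduction, storing the mean and the biased variance, then normalizing by sqrt(var + epsilon) before applying the optional scale and bias. A minimal CPU sketch of the same per-row computation (the function name is illustrative):

#include <cmath>

void LayerNormRowRef(const float* x, const float* scale, const float* bias,
                     float* y, float* mean, float* var, float epsilon,
                     int batch_size, int feature_size) {
  for (int b = 0; b < batch_size; ++b) {
    const float* row = x + b * feature_size;
    double sum = 0.0, sq_sum = 0.0;
    for (int j = 0; j < feature_size; ++j) {  // one pass: E[x] and E[x^2]
      sum += row[j];
      sq_sum += static_cast<double>(row[j]) * row[j];
    }
    const double m = sum / feature_size;
    const double v = sq_sum / feature_size - m * m;  // Var[x] = E[x^2] - E[x]^2
    mean[b] = static_cast<float>(m);
    var[b] = static_cast<float>(v);
    const float inv_std = 1.0f / std::sqrt(static_cast<float>(v) + epsilon);
    for (int j = 0; j < feature_size; ++j) {
      const float norm = (row[j] - static_cast<float>(m)) * inv_std;
      y[b * feature_size + j] =
          (scale ? scale[j] : 1.0f) * norm + (bias ? bias[j] : 0.0f);
    }
  }
}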
const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x->dims(); + auto *x_data = x->data(); + auto *y_data = y->mutable_data(ctx.GetPlace()); + auto *mean_data = mean->mutable_data(ctx.GetPlace()); + auto *var_data = var->mutable_data(ctx.GetPlace()); + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *bias_data = (bias == nullptr ? nullptr : bias->data()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + + auto stream = ctx.cuda_device_context().stream(); + + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward<<>>( + x_data, scale_data, bias_data, y_data, mean_data, var_data, + epsilon, feature_size)); + default: + PADDLE_THROW( + "Product from begin_norm_axis to end must be larger than 1"); + break; + } + } +}; + +template +class LayerNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + // d_x, d_scale, d_bias may be nullptr + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + auto *x = ctx.Input("X"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *d_y = ctx.Input(framework::GradVarName("Y")); + + auto *x_data = x->data(); + auto *d_y_data = d_y->data(); + auto *mean_data = mean->data(); + auto *var_data = var->data(); + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *d_scale_data = + (d_scale == nullptr ? nullptr + : d_scale->mutable_data(ctx.GetPlace())); + auto *d_bias_data = + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + auto *d_x_data = + (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); + + const auto &x_dims = x->dims(); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + + auto stream = ctx.cuda_device_context().stream(); + + LayerNormBackward(x_data, d_y_data, scale_data, mean_data, var_data, + d_x_data, d_scale_data, d_bias_data, epsilon, + batch_size, feature_size, stream); + } +}; + +#undef FIXED_BLOCK_DIM_CASE_BASE +#undef FIXED_BLOCK_DIM_CASE +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 56e39649b409f7eed108027f6df58c19dd3c8ab8..dc008d16971bc762b401ddece56f9ec56f7a47d6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,11 +19,13 @@ limitations under the License. 
*/ #include // NOLINT #include +#include "gflags/gflags.h" + #include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -57,15 +59,16 @@ static void ParallelExecuteBlocks( framework::ProgramDesc *program, framework::Scope *scope) { std::vector> fs; for (size_t idx : parallel_blkids) { - fs.push_back( - framework::Async([&executor, &prepared, &program, &scope, idx]() { - int run_block = idx; // thread local - try { - executor->RunPreparedContext(prepared[run_block].get(), scope); - } catch (const std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); + fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() { + int run_block = idx; // thread local + try { + VLOG(3) << "running server block: " << run_block + << "pointer: " << prepared[run_block].get(); + executor->RunPreparedContext(prepared[run_block].get(), scope); + } catch (const std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } @@ -98,27 +101,36 @@ static int64_t GetTimestamp() { void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, - framework::Scope *recv_scope, + framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, const int checkpoint_point_block_id) const { + VLOG(2) << "RunSyncLoop"; size_t num_blocks = program->Size(); auto optimize_blocks = Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector optimize_blocks_idx; - for (auto blk : optimize_blocks) { - optimize_blocks_idx.push_back(blk->ID()); + // Prepare all the server block + std::vector optimize_blocks_list; + for (size_t i = 1; i < program->Size(); ++i) { + optimize_blocks_list.push_back(i); } - auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx); - // Insert placeholder for block0 which holds current op itself. + auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list); + // Insert placeholder for block0 which holds current op itself, + // NOTE the first block in `optimize_prepared` should never be ran. optimize_prepared.insert( optimize_prepared.begin(), std::shared_ptr(nullptr)); + // Trainers will get all parameters from pserver in the + // startup program, so we will wait RequestGet first + rpc_service_->SetCond(distributed::kRequestGet); + rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); + while (true) { + rpc_service_->Profiler().OneStep(); // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. 
rpc_service_->SetCond(distributed::kRequestSend); @@ -154,18 +166,54 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); - // reset received sparse vars to avoid reuse it in the next mini-batch - dynamic_cast(request_send_handler_.get()) - ->ResetSparseVarRecorder(); } // while(true) } +void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, + platform::DeviceContext *dev_ctx, + bool reset_all) const { + for (auto &varname : sparse_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + VLOG(3) << "reset sparse var: " << varname; + var->GetMutable()->mutable_rows()->clear(); + } else { + PADDLE_THROW("The type of sparse var should be SelectedRows"); + } + } + if (UNLIKELY(reset_all)) { + for (auto &varname : dense_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else { + PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); + } + } + } +} + void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { + VLOG(2) << "RunAsyncLoop"; // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; @@ -235,6 +283,25 @@ static void FillRequestCtx( h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } +void ListenAndServOp::CacheVarsType(const std::vector &varnames, + const framework::Scope &scope) const { + for (const auto &varname : varnames) { + auto var = scope.FindVar(varname); + PADDLE_ENFORCE(var != nullptr, + "Received var should be initialized in the received scope."); + if (var->IsType()) { + sparse_vars_.push_back(varname); + } else if (var->IsType() || + var->IsType()) { + dense_vars_.push_back(varname); + } else { + PADDLE_THROW( + "The type of received var should be in [SelectedRows, LoDTensor, " + "Tensor]."); + } + } +} + void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { // Mark this as PS that it should decide profiling by listening from trainer. @@ -245,6 +312,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, bool sync_mode = Attr("sync_mode"); auto fan_in = Attr("Fanin"); + auto inputs = Inputs("X"); PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); @@ -335,11 +403,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, signal(SIGINT, SignalHandler::StopAndExit); signal(SIGTERM, SignalHandler::StopAndExit); + // Cache the type of the received vars as `sparse_vars_` and `dense_vars_` + // so that we can reset them at the end of each iteration. + // NOTE: only used in sync update + CacheVarsType(inputs, recv_scope); + // Write to a file of server selected port for python use. 
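The reset policy added in this change classifies every received variable once (CacheVarsType) so the per-iteration work stays cheap: sparse vars only get their row lists cleared after each mini-batch, while dense vars are zero-filled only when the server decides a full reset is needed. A self-contained sketch of that policy with stand-in types; none of these names are Paddle APIs:

#include <algorithm>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

enum class VarKind { kSparse, kDense };

struct ReceivedVar {
  std::vector<int64_t> rows;  // sparse: selected row ids
  std::vector<float> data;    // dense: tensor payload
};

void ResetReceived(std::unordered_map<std::string, ReceivedVar>* scope,
                   const std::unordered_map<std::string, VarKind>& kinds,
                   bool reset_all) {
  for (auto& kv : *scope) {
    auto it = kinds.find(kv.first);
    if (it == kinds.end()) continue;
    if (it->second == VarKind::kSparse) {
      kv.second.rows.clear();  // done after every mini-batch
    } else if (reset_all) {    // dense vars: only on a full reset
      std::fill(kv.second.data.begin(), kv.second.data.end(), 0.0f);
    }
  }
}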
SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, - checkpoint_block_id); + RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, + prefetch_block_id_list, checkpoint_block_id); } else { RunAsyncLoop(&executor, program, &recv_scope); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0..5f889793ab16249a4e06801090db087a089dbed1 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -48,6 +49,7 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, const std::vector& prefetch_block_id_list, const int checkpoint_point_block_id) const; @@ -64,6 +66,13 @@ class ListenAndServOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + void ResetReceivedVars(framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, + bool reset_all = false) const; + + void CacheVarsType(const std::vector& varnames, + const framework::Scope& scope) const; + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; @@ -74,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase { request_checkpoint_handler_; mutable std::shared_ptr server_thread_; + mutable std::vector sparse_vars_; + mutable std::vector dense_vars_; }; class SignalHandler { diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a..51219504ffa2a778b56351f759e8a8dfb951ad91 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); - // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. auto filename = Attr("file_path"); @@ -95,6 +92,7 @@ class LoadOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + selectedRows->SyncIndex(); } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 00ba5ce8ee5e4084c8af204cfc37fe80c437f0d7..b3f7e0c0097b469998049a1db65d56a28cf02b5e 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index 2ce11e712fb1a8aa9748313ec7cf4e895a931465..de3f0990e109cacd49c4d888bbc1f797fb196e01 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase { auto out_var = scope.FindVar(Output("Out")); auto w_var = scope.FindVar(Input("W")); auto ids_var = scope.FindVar(Input("Ids")); - unsigned int seed = static_cast(Attr("seed")); - float min = Attr("min"); - float max = Attr("max"); - bool auto_grown_table = Attr("auto_grown_table"); PADDLE_ENFORCE(out_var->IsType(), "The type of Out var should be LodTensor."); @@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase { auto &ids_t = ids_var->Get(); auto out_t = out_var->GetMutable(); auto w_t = w_var->GetMutable(); - std::vector keys; - keys.resize(ids_t.numel()); - for (int64_t i = 0; i < ids_t.numel(); ++i) { - keys[i] = ids_t.data()[i]; - } // TODO(Yancey1989): support CUDA Place for the sparse table platform::CPUPlace cpu; auto out_shape = w_t->value().dims(); - out_shape[0] = keys.size(); + out_shape[0] = ids_t.numel(); out_t->Resize(out_shape); out_t->mutable_data(cpu, w_t->value().type()); PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), framework::proto::VarType::FP32, "The sparse table only support FP32"); - auto non_keys_pair = w_t->Get(keys, out_t); - if (!auto_grown_table) { - PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast(0), - "there is some keys does exists in the sparse table."); - } - auto value_shape = w_t->value().dims(); - value_shape[0] = 1; - for (const auto &it : non_keys_pair) { - const auto key = it.first; - const auto index = it.second; - framework::Tensor value; - value.Resize(value_shape); - auto data = value.mutable_data(cpu); - - std::minstd_rand engine; - engine.seed(seed); - std::uniform_real_distribution dist(min, max); - int64_t size = value.numel(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } - w_t->Set(key, value); - memory::Copy(cpu, out_t->mutable_data(cpu) + index * value.numel(), - cpu, value.data(), value.numel() * sizeof(float)); - } + w_t->Get(ids_t, out_t, true); } }; @@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") .SetDefault(kNoPadding); - AddAttr("min", - "(float, default -1.0) " - "Minimum value of uniform random") - .SetDefault(-1.0f); - AddAttr("max", - "(float, default 1.0) " - "Maximum value of uniform random") - .SetDefault(1.0f); - AddAttr("seed", - "(int, default 0) " - "Random seed used for generating samples. " - "0 means use a seed generated by the system." 
- "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") - .SetDefault(0); AddAttr("auto_grown_table", "(bool default false)" "Whether create new value if for nonexistent key.") diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bda499432214b8841c8dfc406ee45ca0367920e7..d77b095c5d783a2a9fab87eb8b458117a6a3d225 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -32,20 +32,21 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); - auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W - // and it must be a column vector with rank = 2 while the 2nd dimension - // size must be 1, when Ids's type is SelectedRows, the rows of Ids - // contains the ids to be looked up in W; - if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); - } + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); - ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); - ctx->ShareLoD("Ids", /*->*/ "Out"); + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } } protected: @@ -62,17 +63,11 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); - AddInput( - "Ids", - "(Tensor or SelectedRows) Ids's type can be Tensor or " - "SelectedRows, when Ids's type is Tensor, this tensor contains " - "the ids to be looked up in W and it must be a column vector with " - "rank = 2 while the 2nd dimension size must be 1; when Ids's type is " - "SelectedRows, the rows of Ids contains the ids to be looked up " - "in W."); - AddOutput("Out", - "(Tensor or SelectedRows) The lookup results, which have the " - "same type as W."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") @@ -90,15 +85,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { Lookup Table Operator. This operator is used to perform lookups on the parameter W, -then concatenated into a dense or sparse tensor. - -The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's -type is SelectedRows, the rows of Ids contains the ids to be looked up in W; -when Ids's type is Tensor, this tensor contains the ids to be looked up in W -and it must be a column vector with rank = 2 while the 2nd dimension size must be 1, -at this time, Ids can carry the LoD (Level of Details) information, or not, and -the output only shares the LoD information with input Ids. +then concatenated into a dense tensor. 
+The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. )DOC"); } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 77722c50d39003d9342afb04a61ae3aaf6b21100..74823dab09cac358f647c074ac2f2ee2fed17e55 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -23,7 +23,7 @@ namespace operators { template -__global__ void LookupTable(T* output, const T* table, const int64_t* ids, +__global__ void LookupTable(T *output, const T *table, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; @@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, int64_t id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - T* out = output + idy * D; - const T* tab = table + id * D; + T *out = output + idy * D; + const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { if (PaddingFlag) { if (id == padding_idx) @@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, } template -__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, +__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; @@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, int id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - const T* out = output + idy * D; - T* tab = table + id * D; + const T *out = output + idy * D; + T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } @@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, template class LookupTableCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); - - int64_t* ids; - int64_t K; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. 
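Both the CUDA kernel above and the CPU kernel later in this change give padding_idx the same meaning: an id equal to padding_idx produces an all-zero output row instead of a table lookup. A minimal CPU sketch of that rule (names are illustrative; bounds checks on ids are omitted here but enforced in the kernels):

#include <cstdint>
#include <cstring>

void LookupWithPadding(const float* table, const int64_t* ids, float* out,
                       int64_t K, int64_t D, int64_t padding_idx) {
  for (int64_t i = 0; i < K; ++i) {
    if (ids[i] == padding_idx) {
      std::memset(out + i * D, 0, sizeof(float) * D);  // padded id -> zero row
    } else {
      std::memcpy(out + i * D, table + ids[i] * D, sizeof(float) * D);
    }
  }
}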
- if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - K = ids_t->numel(); - } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().CUDAData(context.GetPlace())); - K = ids_t->rows().size(); - output_t->Resize({K, table_t->dims()[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); dim3 threads(128, 8); dim3 grids(8, 1); @@ -122,41 +105,44 @@ class LookupTableCUDAKernel : public framework::OpKernel { template class LookupTableGradCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = + void Compute(const framework::ExecutionContext &context) const override { + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *table = context.Input("W"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); - auto ids_dim = ids->dims(); + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; - new_rows.resize(ids_dim[0]); + new_rows.resize(ids_num); auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. 
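The sparse branch builds the embedding gradient as a SelectedRows-style pair: the rows are exactly the looked-up ids (duplicates included; merging is left to whoever consumes the gradient) and the value block is d(Out) flattened to [ids_num, D], which is why the kernel can copy d_output verbatim once the flatten_to_2d shape check passes. A host-side sketch with illustrative types:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct SelectedRowsGrad {     // illustrative stand-in, not the Paddle type
  std::vector<int64_t> rows;  // one entry per id occurrence (may repeat)
  std::vector<float> value;   // ids_num x D; row i is the grad for rows[i]
  int64_t height = 0;         // vocabulary size
};

SelectedRowsGrad EmbeddingGradSketch(const std::vector<int64_t>& ids,
                                     const std::vector<float>& d_out,  // ids_num * D
                                     int64_t vocab_size, int64_t D) {
  SelectedRowsGrad g;
  g.rows = ids;
  g.height = vocab_size;
  g.value.resize(ids.size() * static_cast<std::size_t>(D));
  std::memcpy(g.value.data(), d_out.data(), sizeof(float) * d_out.size());
  return g;
}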
memory::Copy(platform::CPUPlace(), new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_dim[0] * sizeof(int64_t), stream); + ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table->dims()[1]}); + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table->dims()[1]}); d_table_value->mutable_data(context.GetPlace()); - auto* d_table_data = d_table_value->data(); - auto* d_output_data = d_output->data(); - PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + auto *d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, d_output->numel() * sizeof(T), stream); @@ -168,9 +154,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t* ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); + const int64_t *ids = ids_t->data(); + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index d482506bf0361c11a019e32efbf348a64aaf5164..58463dc4d6fd7cc3454de766814a947fee161070 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -36,43 +36,13 @@ template class LookupTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto *ids_var = context.InputVar("Ids"); - Tensor *output_t = context.Output("Out"); - int64_t padding_idx = context.Attr("padding_idx"); - - DDim table_dim; - - if (table_var->IsType()) { - table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); - table_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); - } - int64_t *ids; - int64_t ids_numel; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. 
- if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - ids_numel = ids_t->numel(); - } else if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); - ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_dim[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); if (table_var->IsType()) { auto *table_t = context.Input("W"); @@ -87,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel { memset(output + i * row_width, 0, row_width * sizeof(T)); } else { PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); memcpy(output + i * row_width, table + ids[i] * row_width, row_width * sizeof(T)); } @@ -139,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); + int64_t ids_num = ids->numel(); framework::Vector new_rows; - new_rows.reserve(ids_dim[0]); - for (int64_t i = 0; i < ids_dim[0]; i++) { + new_rows.reserve(ids_num); + for (int64_t i = 0; i < ids_num; i++) { new_rows.push_back(ids_data[i]); } d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table_dim[1]}); + d_table_value->Resize({ids_num, table_dim[1]}); d_table_value->mutable_data(context.GetPlace()); d_table->set_height(table_dim[0]); @@ -157,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_output_data = d_output->data(); auto *d_table_data = d_table_value->data(); - PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { auto *ids = context.Input("Ids"); @@ -165,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); int N = table_dim[0]; - int D = d_output->dims()[1]; + int D = table_dim[1]; auto *d_output_data = d_output->data(); auto *d_table_data = d_table->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 5571ff9a7151c1f971ad1805bf001815a651202b..d7f0f3c6280db7d121bf8821ec6d578e22a33da6 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,4 +1,6 @@ +if (NOT WIN32) add_subdirectory(detail) +endif(NOT WIN32) function(math_library TARGET) # math_library is a function to create math library. @@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function) math_library(cross_entropy) math_library(cos_sim_functor) math_library(depthwise_conv) -math_library(gru_compute DEPS activation_functions math_function) math_library(im2col) + +if (NOT WIN32) # windows do not support avx functions yet. 
+math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) +endif (NOT WIN32) + cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) math_library(maxouting) @@ -51,6 +57,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) +if (NOT WIN32) +math_library(matrix_bit_code) +endif (NOT WIN32) math_library(unpooling) math_library(vol2col) @@ -64,3 +73,4 @@ if(WITH_GPU) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat) +cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f6c1e5c35f02cd4bc729eea78b17fac017aa90e..da185d93c09f9b06bd5968b9c8e93176f9ef014b 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -21,6 +21,10 @@ #include "paddle/fluid/platform/dynload/mklml.h" #endif +#ifdef PADDLE_WITH_LIBXSMM +#include +#endif + #ifdef PADDLE_USE_OPENBLAS #include #endif @@ -86,6 +90,34 @@ class Blas { void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, int ldc) const; + template + void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, + int ldc) const; + +#ifdef PADDLE_WITH_MKLML + template + T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, + const int K) const; + + template + void GEMM_PACK(const CBLAS_IDENTIFIER id, const CBLAS_TRANSPOSE trans, int M, + int N, int K, const T alpha, const T* src, const int ld, + T* dst) const; + + template + void GEMM_COMPUTE(int transA, int transB, int M, int N, int K, const T* A, + const int lda, const T* B, const int ldb, T beta, T* C, + const int ldc) const; + + template + void GEMM_FREE(T* data) const; +#endif + + template + void MatMul(const int M, const int N, const int K, const T* A, const T* B, + T* C) const; + template void MatMul(const framework::Tensor& mat_a, bool trans_a, const framework::Tensor& mat_b, bool trans_b, T alpha, @@ -111,13 +143,25 @@ class Blas { template void VADD(int n, const T* x, const T* y, T* z) const; + template + void VMUL(int n, const T* x, const T* y, T* z) const; + template void VCOPY(int n, const T* x, T* y) const; + template + void VEXP(int n, const T* x, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; + template + T DOT(int n, const T* x, const T* y) const; + + template + void SCAL(int n, const T a, T* x) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, @@ -142,6 +186,28 @@ class BlasT : private Blas { Base()->template GEMM(args...); } +#ifdef PADDLE_WITH_MKLML + template + T* GEMM_ALLOC(ARGS... args) const { + return Base()->template GEMM_ALLOC(args...); + } + + template + void GEMM_PACK(ARGS... args) const { + Base()->template GEMM_PACK(args...); + } + + template + void GEMM_COMPUTE(ARGS... args) const { + Base()->template GEMM_COMPUTE(args...); + } + + template + void GEMM_FREE(ARGS... args) const { + Base()->template GEMM_FREE(args...); + } +#endif + template void MatMul(ARGS... 
args) const { Base()->template MatMul(args...); @@ -157,16 +223,36 @@ class BlasT : private Blas { Base()->template VADD(args...); } + template + void VMUL(ARGS... args) const { + Base()->template VMUL(args...); + } + template void VCOPY(ARGS... args) const { Base()->template VCOPY(args...); } + template + void VEXP(ARGS... args) const { + Base()->template VEXP(args...); + } + template void GEMV(ARGS... args) const { Base()->template GEMV(args...); } + template + T DOT(ARGS... args) const { + return Base()->template DOT(args...); + } + + template + void SCAL(ARGS... args) const { + Base()->template SCAL(args...); + } + template void BatchedGEMM(ARGS... args) const { Base()->template BatchedGEMM(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 2ce94cfc93823aa891114ef8fd1e851727ebc623..e1df78d11e41c5f74e244643f40c6d0581fa6a4a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include "paddle/fluid/operators/math/math_function.h" @@ -30,6 +31,33 @@ struct CBlas { platform::dynload::cblas_sgemm(args...); } + template + static float *GEMM_ALLOC(ARGS... args) { + return platform::dynload::cblas_sgemm_alloc(args...); + } + + template + static void GEMM_PACK(ARGS... args) { + platform::dynload::cblas_sgemm_pack(args...); + } + + template + static void GEMM_COMPUTE(ARGS... args) { + platform::dynload::cblas_sgemm_compute(args...); + } + + template + static void GEMM_FREE(ARGS... args) { + platform::dynload::cblas_sgemm_free(args...); + } + +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_sgemm(args...); + } +#endif + template static void AXPY(ARGS... args) { platform::dynload::cblas_saxpy(args...); @@ -45,6 +73,16 @@ struct CBlas { platform::dynload::cblas_sgemv(args...); } + template + static float DOT(ARGS... args) { + return platform::dynload::cblas_sdot(args...); + } + + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_sscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -54,6 +92,16 @@ struct CBlas { static void VADD(ARGS... args) { platform::dynload::vsAdd(args...); } + + template + static void VMUL(ARGS... args) { + platform::dynload::vsMul(args...); + } + + template + static void VEXP(ARGS... args) { + platform::dynload::vsExp(args...); + } }; template <> @@ -63,6 +111,33 @@ struct CBlas { platform::dynload::cblas_dgemm(args...); } + template + static double *GEMM_ALLOC(ARGS... args) { + return platform::dynload::cblas_dgemm_alloc(args...); + } + + template + static void GEMM_PACK(ARGS... args) { + platform::dynload::cblas_dgemm_pack(args...); + } + + template + static void GEMM_COMPUTE(ARGS... args) { + platform::dynload::cblas_dgemm_compute(args...); + } + + template + static void GEMM_FREE(ARGS... args) { + platform::dynload::cblas_dgemm_free(args...); + } + +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_dgemm(args...); + } +#endif + template static void AXPY(ARGS... args) { platform::dynload::cblas_daxpy(args...); @@ -78,6 +153,16 @@ struct CBlas { platform::dynload::cblas_dgemv(args...); } + template + static double DOT(ARGS... args) { + return platform::dynload::cblas_ddot(args...); + } + + template + static void SCAL(ARGS... 
args) { + platform::dynload::cblas_dscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -87,6 +172,16 @@ struct CBlas { static void VADD(ARGS... args) { platform::dynload::vdAdd(args...); } + + template + static void VMUL(ARGS... args) { + platform::dynload::vdMul(args...); + } + + template + static void VEXP(ARGS... args) { + platform::dynload::vdExp(args...); + } }; #else @@ -137,9 +232,17 @@ struct CBlas { } }; #endif + template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void SMM_GEMM(...) { + PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + } + static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } + static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; + static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -147,6 +250,41 @@ struct CBlas { #endif }; +#ifdef PADDLE_WITH_MKLML +template <> +template +T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, + const int M, const int N, + const int K) const { + return CBlas::GEMM_ALLOC(id, M, N, K); +} + +template <> +template +void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, + const CBLAS_TRANSPOSE trans, + int M, int N, int K, + const T alpha, const T *src, + const int ld, T *dst) const { + CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); +} + +template <> +template +void Blas::GEMM_COMPUTE( + int transA, int transB, int M, int N, int K, const T *A, const int lda, + const T *B, const int ldb, T beta, T *C, const int ldc) const { + CBlas::GEMM_COMPUTE(CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +template +void Blas::GEMM_FREE(T *data) const { + CBlas::GEMM_FREE(data); +} +#endif + template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -171,6 +309,17 @@ void Blas::GEMM(bool transA, bool transB, int M, lda, B, ldb, beta, C, ldc); } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, + int N, int K, T alpha, const T *A, + int lda, const T *B, int ldb, + T beta, T *C, int ldc) const { + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + template template void Blas::MatMul(const framework::Tensor &mat_a, bool trans_a, @@ -222,6 +371,61 @@ void Blas::VADD(int n, const T *x, const T *y, #endif } +template <> +template +void Blas::VMUL(int n, const T *x, const T *y, + T *z) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VMUL(n, x, y, z); +#else + // try to find if openblas support vmul + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +#endif +} + +template <> +template +void Blas::VEXP(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VEXP(n, x, y); +#else + // try to find if openblas support vexp + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +#endif +} + +template <> +template +T Blas::DOT(int n, const T *x, const T *y) const { +#ifdef PADDLE_WITH_MKLML + return CBlas::DOT(n, x, 1, y, 1); +#else + // try to find if openblas support cblas_dot + T sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] * y[i]; + } + return sum; +#endif +} + +template <> +template +void Blas::SCAL(int n, const T a, T *x) const { +#ifdef PADDLE_WITH_MKLML + CBlas::SCAL(n, a, x, 1); 
+#else + // try to find if openblas support cblas_scal + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, @@ -263,6 +467,42 @@ void Blas::BatchedGEMM( #endif } +template +template +void Blas::MatMul(const int M, const int N, const int K, + const T *A, const T *B, T *C) const { + this->template GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, + N); +} + +template <> +template +void Blas::MatMul(const int M, const int N, + const int K, const T *A, + const T *B, T *C) const { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + + // Since the matrix is very small, + // so the unit of calculation is already very fast, + // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, + // use xsmm directly. + // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + const T alpha = static_cast(1); + const T beta = static_cast(0); + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, + C, &N); + return; +#endif + + CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, N); +} + template template void Blas::MatMul(const framework::Tensor &mat_a, diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/fluid/operators/math/compound_functors.h new file mode 100644 index 0000000000000000000000000000000000000000..1d32a9585b08a9d27730076d9f7baa6056270a42 --- /dev/null +++ b/paddle/fluid/operators/math/compound_functors.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace math { + +template +struct BinaryCompoundFunctor { + BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2) + : func1_(func1), func2_(func2) {} + // Z = BinaryFunctor(X, UnaryFunctor(Y)) + + inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); } + + inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) { + return func1_(x, intermediat_out); + } + + inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(y); } + + BinaryFunctor func1_; + UnaryFunctor func2_; +}; + +template +struct UnaryCompoundFunctor { + UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2) + : func1_(func1), func2_(func2) {} + // Z = UnaryFunctor(BinaryFunctor(X, Y)) + + inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); } + + inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) { + return func1_(intermediat_out); + } + + inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(x, y); } + + UnaryFunctor func1_; + BinaryFunctor func2_; +}; + +// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get +// the dx, one is to use the 'out', and the other is not to use it. +// the former method will save the time of recomputing the +// 'out', but it must occupy the memory to store the 'out'. +// While the later method can avoid occupying this memory, +// but it must recompute the 'out'. +template +struct BinaryCompoundGradDxFunctor { + BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun, + const UnaryFun &unary_fun) + : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + return dout * d_binary_fun_.Dx(x, unary_fun_(y)); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + return dout * d_binary_fun_.Dx(x, intermediate_out); + } + + private: + DBinaryFun d_binary_fun_; + UnaryFun unary_fun_; +}; + +template +struct BinaryCompoundGradDyFunctor { + BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, + const UnaryFun &unary_fun, + const DUnaryFun &d_unary_fun) + : d_binary_fun_(d_binary_fun), + unary_fun_(unary_fun), + d_unary_fun_(d_unary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_(y); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + return dout * d_binary_fun_.Dy(x, intermediate_out) * + d_unary_fun_(y, intermediate_out); + } + + private: + DBinaryFun d_binary_fun_; + UnaryFun unary_fun_; + DUnaryFun d_unary_fun_; +}; + +template +struct UnaryCompoundGradDxFunctor { + UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, + const BinaryFun &binary_fun, + const DBinaryFun &d_binary_fun) + : d_unary_fun_(d_unary_fun), + binary_fun_(binary_fun), + d_binary_fun_(d_binary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(binary_fun_(x, y)); + } else { + base = dout * d_unary_fun_(binary_fun_(x, y), out); + } + return base * d_binary_fun_.Dx(x, y); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(intermediate_out); + } else { + base = dout * d_unary_fun_(intermediate_out, out); + } + return base * d_binary_fun_.Dx(x, y); + } + + private: + DUnaryFun d_unary_fun_; + BinaryFun binary_fun_; 
+ DBinaryFun d_binary_fun_; +}; + +template +struct UnaryCompoundGradDyFunctor { + UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun, + const BinaryFun &binary_fun, + const DBinaryFun &d_binary_fun) + : d_unary_fun_(d_unary_fun), + binary_fun_(binary_fun), + d_binary_fun_(d_binary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(binary_fun_(x, y)); + } else { + base = dout * d_unary_fun_(binary_fun_(x, y), out); + } + return base * d_binary_fun_.Dy(x, y); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(intermediate_out); + } else { + base = dout * d_unary_fun_(intermediate_out, out); + } + return base * d_binary_fun_.Dy(x, y); + } + + private: + DUnaryFun d_unary_fun_; + BinaryFun binary_fun_; + DBinaryFun d_binary_fun_; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index 55c8a472aca7fe700ef6a3f96bed1496d7b12b80..c3c5c160db358d39aa3f841a2b1646a21c91440e 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -48,16 +48,16 @@ class ConcatFunctor { auto cpu_place = boost::get(context.GetPlace()); // computation - for (int k = 0; k < out_rows; ++k) { - T* dst_ptr = output->data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - const T* src_prt = input[j].data() + k * col_len; - memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt, - sizeof(T) * col_len); - col_idx += col_len; + auto output_data = output->data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = input[j].data(); + for (int k = 0; k < out_rows; ++k) { + memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place, + input_data + k * col_len, sizeof(T) * col_len); } + col_idx += col_len; } } }; @@ -71,7 +71,7 @@ class ConcatGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking size_t num = outputs->size(); diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 5863d74fca21de8b77bc208fb95d8fd52562f7a7..342379268be36cc5b532363e664f6e73990333e1 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -177,6 +177,9 @@ class ConcatFunctor { dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } + // Wait() must be called because `inputs_data` may be destructed before + // kernel ends + context.Wait(); } }; @@ -189,7 +192,7 @@ class ConcatGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking int o_num = outputs->size(); @@ -252,6 +255,9 @@ class ConcatGradFunctor { input.data(), in_row, in_col, dev_outs_col_data, static_cast(outputs_cols.size()), dev_out_gpu_data); } + // Wait() must be called because `outputs_data` may be destructed before + // kernel ends + context.Wait(); } }; diff --git 
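Note: the compound functors above fuse two elementwise ops into one pass and expose the intermediate result so the backward pass can either reuse it or recompute it (the trade-off described in the FIXME). A minimal stand-alone sketch of that composition, using stand-in functors and helper names of our own (not the operator's actual instantiation):

    #include <cstdio>

    // Minimal stand-ins for the composed functors (illustration only).
    struct Add   { float operator()(float x, float y) const { return x + y; } };
    struct Scale { float a; float operator()(float y) const { return a * y; } };

    // Z = Binary(X, Unary(Y)); the intermediate Unary(Y) can be stored and fed
    // back (GetOutUseIntermediateOut) or recomputed later -- the memory vs.
    // recompute trade-off noted in the FIXME above.
    template <typename Binary, typename Unary>
    void binary_compound(const Binary& f1, const Unary& f2, const float* x,
                         const float* y, float* intermediate, float* z, int n) {
      for (int i = 0; i < n; ++i) {
        intermediate[i] = f2(y[i]);        // GetIntermediateOut
        z[i] = f1(x[i], intermediate[i]);  // GetOutUseIntermediateOut
      }
    }

    int main() {
      float x[3] = {1, 2, 3}, y[3] = {4, 5, 6}, im[3], z[3];
      binary_compound(Add{}, Scale{0.5f}, x, y, im, z, 3);
      for (float v : z) std::printf("%g ", v);  // prints: 3 4.5 6
      return 0;
    }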
a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index 9e080f2e8be23768dcea47b577043beef37b2eaf..e5d7d860b371677b3cfc67a57390bdee0d0ecc37 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/lod_tensor.h" namespace paddle { namespace operators { @@ -57,7 +57,7 @@ template class ConcatGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs); }; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h new file mode 100644 index 0000000000000000000000000000000000000000..9560e3a3c15ca63892fbe3552679a22f027f11e2 --- /dev/null +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -0,0 +1,485 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/platform/cpu_info.h" +#ifdef __AVX__ +#include +#endif + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +namespace paddle { +namespace operators { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 + +#define AVX_FLOAT_BLOCK 8 +#define AVX_DOUBLE_BLOCK 4 +#define AVX2_FLOAT_BLOCK 8 +#define AVX2_DOUBLE_BLOCK 4 +#define AVX512_FLOAT_BLOCK 16 +#define AVX512_DOUBLE_BLOCK 8 + +template +inline void vec_exp(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +inline void vec_scal(const int n, const T a, T* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} + +#ifdef PADDLE_WITH_MKLML +template <> +inline void vec_exp(const int n, const float* x, float* y) { + platform::dynload::vsExp(n, x, y); +} + +template <> +inline void vec_exp(const int n, const double* x, double* y) { + platform::dynload::vdExp(n, x, y); +} + +template <> +inline void vec_scal(const int n, const float a, float* x) { + platform::dynload::cblas_sscal(n, a, x, 1); +} + +template <> +inline void vec_scal(const int n, const double a, double* x) { + platform::dynload::cblas_dscal(n, a, x, 1); +} +#endif + +// MKL scal only support inplace, choose this if src and dst are not equal +template +inline void vec_scal(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} + +template <> +inline void vec_scal(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_scal(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 scalar = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = 
_mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a * x[i]; + } +#else + vec_scal(n, a, x, y); +#endif +} + +template <> +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); +} + +template <> +inline void vec_scal(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_scal(n, a, x, y); +} + +template +inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a - x[i]; + } +} + +template <> +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_bias_sub(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_sub_ps(bias, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a - x[i]; + } +#else + vec_bias_sub(n, a, x, y); +#endif +} + +template <> +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_bias_sub(n, a, x, y); +} + +// out = x*y + (1-x)*z +template +inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { + for (int i = 0; i < n; ++i) { + out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; + } +} + +template <> +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_cross(n, x, y, z, out); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(1.f); + __m256 tmpx, tmpy, tmpz; + for (i = 0; i < end; i += block) { + tmpx = _mm256_loadu_ps(x + i); + tmpy = _mm256_loadu_ps(y + i); + tmpz = _mm256_loadu_ps(z + i); + tmpy = _mm256_mul_ps(tmpx, tmpy); + tmpx = _mm256_sub_ps(bias, tmpx); + tmpz = _mm256_mul_ps(tmpx, tmpz); + tmpz = _mm256_add_ps(tmpy, tmpz); + _mm256_storeu_ps(out + i, tmpz); + } + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; + } +#else + vec_cross(n, x, y, z, out); +#endif +} + +template <> +inline void vec_cross(const int n, const float* x, + const float* y, + const float* z, float* out) { + vec_cross(n, x, y, z, out); +} + +template <> +inline void vec_cross( + const int n, const float* x, const float* y, const float* z, float* out) { + // TODO(TJ): enable me + vec_cross(n, x, y, z, out); +} + +template +inline void vec_add_bias(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +template <> +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + 
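Note: every AVX specialization above follows the same block-plus-remainder shape. A condensed sketch of that pattern for y = a * x (our own helper, compiled only when __AVX__ is defined); the remainder is handled element-wise because re-running an overlapping vector block would transform some elements twice when x and y alias (the in-place case the comments above warn about):

    #include <cstddef>
    #ifdef __AVX__
    #include <immintrin.h>
    #endif

    void scale_avx(std::size_t n, float a, const float* x, float* y) {
    #ifdef __AVX__
      const std::size_t block = 8;  // 8 floats per 256-bit register
      const std::size_t end = n - n % block;
      const __m256 va = _mm256_set1_ps(a);
      for (std::size_t i = 0; i < end; i += block) {
        _mm256_storeu_ps(y + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), va));
      }
      // Scalar tail: safe even when x == y (in-place call).
      for (std::size_t i = end; i < n; ++i) y[i] = a * x[i];
    #else
      for (std::size_t i = 0; i < n; ++i) y[i] = a * x[i];
    #endif
    }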
vec_add_bias(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_add_ps(tmp, bias); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = x[i] + a; + } +#else + vec_add_bias(n, a, x, y); +#endif +} + +template <> +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); +} + +template <> +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_add_bias(n, a, x, y); +} + +template +inline void vec_identity(const int n, const T* x, T* y) { + // do nothing + return; +} + +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vec_exp(n, y, y); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template <> +inline void vec_sigmoid(const int n, const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_sigmoid(n, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(zeros, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest != 0) { + // can not continue move step since the src and dst address could be equal + const float xmin = SIGMOID_THRESHOLD_MIN; + const float xmax = SIGMOID_THRESHOLD_MAX; + for (i = n - rest; i < n; ++i) { + y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i])); + } + } + + vec_exp(n, y, y); + + __m256 ones = _mm256_set1_ps(1.0f); +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(y + i); \ + tmp = _mm256_add_ps(ones, tmp); \ + tmp = _mm256_div_ps(ones, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step + for (i = n - rest; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +#else + vec_sigmoid(n, x, y); +#endif +} + +template <> +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_sigmoid(n, x, y); +} + +template +inline void vec_tanh(const int n, const T* x, T* y) { + vec_scal(n, static_cast(2), x, y); + vec_sigmoid(n, y, y); + vec_scal(n, static_cast(2), y); + vec_add_bias(n, static_cast(-1), y, y); +} + +// TODO(TJ): make relu clip +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block * 4) { + vec_relu(n, x, y); + return; + } + + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } + if (rest == 0) { + return; + } + i = n - block; + MOVE_ONE_STEP; +#undef MOVE_ONE_STEP + +#else + vec_relu(n, x, y); +#endif +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + vec_relu(n, x, y); +} + +template <> +inline void vec_relu(const int n, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_relu(n, x, y); +} + +// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary + +template +class VecActivations { + public: + std::function operator()( + const std::string& type) { + if (type == "sigmoid") { + return vec_sigmoid; + } else if (type == "relu") { + return vec_relu; + } else if (type == "tanh") { + return vec_tanh; + } else if (type == "identity" || type == "") { + return vec_identity; + } + LOG(FATAL) << "Not support type: " << type; + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3ce66f49ed8354c49e8af26ca6eb48fef654a40b --- /dev/null +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/operators/math/cpu_vec.h" + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} +constexpr int repeat = 1000; + +template +inline T _sigmoid(T x) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (x < min) ? min : ((x > max) ? max : x); + return static_cast(1) / (static_cast(1) + std::exp(-tmp)); +} + +template +inline T _tanh(T x) { + return static_cast(2) * _sigmoid(static_cast(2) * x) - + static_cast(1); +} + +template +void ref_sigmoid(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = _sigmoid(x[i]); + } +} + +template +void ref_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = _tanh(x[i]); + } +} +template +void ref_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? 
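Note: as a rough usage sketch for cpu_vec.h above, VecActivations maps a name to the matching vec_* kernel. This assumes the class takes the value type as its first template parameter with the ISA parameter defaulted; the helper name apply_activation is ours:

    #include <string>
    #include <vector>
    #include "paddle/fluid/operators/math/cpu_vec.h"

    // Hypothetical call site: pick a kernel by name and run it on a buffer.
    void apply_activation(const std::string& name, const std::vector<float>& x,
                          std::vector<float>* y) {
      namespace math = paddle::operators::math;
      y->resize(x.size());
      math::VecActivations<float> pick;
      auto act = pick(name);  // "sigmoid", "tanh", "relu", or "identity"
      act(static_cast<int>(x.size()), x.data(), y->data());
    }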
x[i] : 0; + } +} + +template +void RandomVec(const int n, T* a) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + const T lower = static_cast(-20.f); + const T upper = static_cast(20.f); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +template +void TestAndBench(const int n, std::function tgt, + std::function ref) { + std::vector x(n); + std::vector ytgt(n), yref(n); + RandomVec(n, x.data()); + + const T* x_data = x.data(); + T* ytgt_data = ytgt.data(); + T* yref_data = yref.data(); + auto st = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + tgt(n, x_data, ytgt_data); + } + auto mt = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ref(n, x_data, yref_data); + } + auto et = GetCurrentUS(); + + VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat + << " us, tgt takes: " << (mt - st) / repeat; + for (int i = 0; i < n; ++i) { + EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); + } +} + +TEST(CpuVecTest, sigmoid) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestAndBench(sz, vec_sigmoid, ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + } + TestAndBench(30, vec_sigmoid, ref_sigmoid); +} + +TEST(CpuVecTest, tanh) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); + } + TestAndBench(30, vec_tanh, ref_tanh); +} + +TEST(CpuVecTest, relu) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); + } + TestAndBench(30, vec_relu, ref_relu); +} + +template +void TestInplace(const int n, std::function tgt, + std::function ref) { + std::vector x(n); + std::vector ytgt(n), yref(n); + RandomVec(n, x.data()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* ytgt_data = ytgt.data(); + std::memcpy(yref_data, x_data, sizeof(T) * n); + std::memcpy(ytgt_data, x_data, sizeof(T) * n); + + ref(n, yref_data, yref_data); + tgt(n, ytgt_data, ytgt_data); + + for (int i = 0; i < n; ++i) { + EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); + } +} + +TEST(CpuVecTest, inplace_sigmoid) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestInplace(sz, vec_sigmoid, ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + } + TestInplace(30, vec_sigmoid, ref_sigmoid); +} + +TEST(CpuVecTest, inplace_tanh) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, + ref_tanh); + } + TestInplace(30, 
vec_tanh, ref_tanh); +} + +TEST(CpuVecTest, inplace_relu) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, + ref_relu); + } + TestInplace(30, vec_relu, ref_relu); +} diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index caff35e03ae3a144f799d982c859ded62cb3e93d..18bf1a66f6d9903f32048574dc93faf7e98953ac 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -28,7 +28,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel) { + const framework::Tensor* labels, const bool softLabel, + const int ignore_index) { const int batch_size = prob->dims()[0]; if (softLabel) { auto in = EigenMatrix::From(*prob); @@ -49,8 +50,12 @@ class CrossEntropyFunctor { int lbl = label_data[i]; PADDLE_ENFORCE_GE(lbl, 0); PADDLE_ENFORCE_LT(lbl, class_num); + PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index); int index = i * class_num + lbl; - loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); + loss_data[i] = + lbl == ignore_index + ? 0 + : -math::TolerableValue()(std::log(prob_data[index])); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b..c92341ea55ea21773acba33665e267b2f1c25fe3 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -23,11 +23,14 @@ namespace math { namespace { template __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, - const int N, const int D) { + const int N, const int D, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { - PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -math::TolerableValue()(log(X[i * D + label[i]])); + PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index); + Y[i] = ignore_index == label[i] + ? 
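Note: the cross-entropy change above adds an ignore_index, so rows whose label equals it contribute zero loss instead of tripping the bounds check. A plain reference of the hard-label path (our own helper; the TolerableValue clamping is omitted):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    std::vector<float> cross_entropy_ref(const std::vector<float>& prob,  // N x D
                                         const std::vector<int64_t>& label,
                                         int D, int ignore_index) {
      std::vector<float> loss(label.size());
      for (size_t i = 0; i < label.size(); ++i) {
        loss[i] = (label[i] == ignore_index)
                      ? 0.f
                      : -std::log(prob[i * D + static_cast<size_t>(label[i])]);
      }
      return loss;
    }

    int main() {
      std::vector<float> prob = {0.7f, 0.2f, 0.1f, 0.1f, 0.8f, 0.1f};  // 2 x 3
      std::vector<int64_t> label = {0, -100};
      auto loss = cross_entropy_ref(prob, label, 3, /*ignore_index=*/-100);
      std::printf("%g %g\n", loss[0], loss[1]);  // -log(0.7) and 0
      return 0;
    }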
0 + : -math::TolerableValue()(log(X[i * D + label[i]])); } } @@ -57,7 +60,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CUDADeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, bool softLabel) { + const framework::Tensor* labels, bool softLabel, + const int ignore_index) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -77,7 +81,8 @@ class CrossEntropyFunctor { int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<>>( - loss_data, prob_data, label_data, batch_size, class_num); + loss_data, prob_data, label_data, batch_size, class_num, + ignore_index); } } }; diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index adc5b3fe47cd3bf524eb56747b6bd51e345a2eb6..e8aeb5d0575ac0f6b8761e97896df73578e8a103 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,8 @@ class CrossEntropyFunctor { public: void operator()(const DeviceContext& context, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel); + const framework::Tensor* labels, const bool softLabel, + const int ignore_index); }; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..1f5a49c0ab5a10b0d7dc1febd258ce76c467cb1c --- /dev/null +++ b/paddle/fluid/operators/math/fc_compute.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/blas.h" + +DECLARE_int32(paddle_num_threads); + +namespace paddle { +namespace operators { +namespace math { + +template +inline void FCCompute(const BlasT& blas, const int M, + const int N, const int K, const T* X, const T* W, T* Y, + const T* B = NULL, bool relu = false) { + blas.MatMul(M, N, K, X, W, Y); + if (B == NULL) { + return; + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) +#endif + for (int i = 0; i < M; i++) { + blas.AXPY(N, static_cast(1), B, Y + i * N); + } + + if (!relu) { + return; + } + + // TODO(TJ): fuse relu + LOG(FATAL) << "Not implemented!"; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h new file mode 100644 index 0000000000000000000000000000000000000000..ddb01cdfc084f5ba2e9e573be461389f46fbe03f --- /dev/null +++ b/paddle/fluid/operators/math/functors.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
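Note: FCCompute above delegates the GEMM to BLAS and broadcasts the bias row by row with AXPY (the relu fusion is still a TODO). A plain, unoptimized reference of the same result, useful for checking and entirely our own code:

    #include <vector>

    // Y = X * W, then B is added to every row of Y.
    void fc_reference(int M, int N, int K, const std::vector<float>& X,
                      const std::vector<float>& W, const std::vector<float>& B,
                      std::vector<float>* Y) {
      Y->assign(static_cast<size_t>(M) * N, 0.f);
      for (int m = 0; m < M; ++m)
        for (int k = 0; k < K; ++k)
          for (int n = 0; n < N; ++n)
            (*Y)[m * N + n] += X[m * K + k] * W[k * N + n];
      for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n) (*Y)[m * N + n] += B[n];
    }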
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace operators { +namespace math { + +// MulFunctor +template +struct MulFunctor { + // out = x * y; + inline HOSTDEVICE T operator()(T x, T y) { return x * y; } +}; + +template +struct MulGradFunctor { + inline HOSTDEVICE T Dx(T x, T y) { return y; } + inline HOSTDEVICE T Dy(T x, T y) { return x; } +}; + +// AddFunctor +template +struct AddFunctor { + // out = x + y; + inline HOSTDEVICE T operator()(T x, T y) { return x + y; } +}; + +template +struct AddGradFunctor { + inline HOSTDEVICE T Dx(T x, T y) { return 1; } + inline HOSTDEVICE T Dy(T x, T y) { return 1; } +}; + +template +struct ScaleFunctor { + explicit ScaleFunctor(const T coeff) : coeff_(coeff) {} + + inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; } + + private: + T coeff_; +}; + +template +struct ScaleGradFunctor { + explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {} + + inline HOSTDEVICE T operator()(T x) { return coeff_; } + + inline HOSTDEVICE T operator()(T x, T out) { return coeff_; } + + private: + T coeff_; +}; + +template +struct ReluFunctor { + inline HOSTDEVICE T operator()(T x) { return x * (x > 0); } +}; + +template +struct ReluGradFunctor { + inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; } + + inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 336d6febc2ce3a55e82ed613bbc1081101f822f0..1472edbbf47e3e4d6b22c65349713904b13647d2 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -14,6 +14,7 @@ limitations under the License. 
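Note: together with the compound functors earlier in this patch, the functors in functors.h chain like ordinary scalar derivatives. A scalar spot-check for z = relu(x + y), written stand-alone rather than with the operator code:

    #include <cstdio>

    float relu(float v) { return v > 0 ? v : 0; }
    float relu_grad(float v) { return v > 0 ? 1.f : 0.f; }

    int main() {
      float x = 2.f, y = -3.f, dout = 1.f;
      float intermediate = x + y;                       // AddFunctor
      float z = relu(intermediate);                     // ReluFunctor
      float dx = dout * relu_grad(intermediate) * 1.f;  // AddGradFunctor::Dx == 1
      std::printf("z=%g dx=%g\n", z, dx);               // z=0 dx=0
      return 0;
    }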
*/ #include "paddle/fluid/operators/math/im2col.h" #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" namespace paddle { namespace operators { @@ -35,51 +36,18 @@ class Im2ColFunctordims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int col_height = col->dims()[3]; - int col_width = col->dims()[4]; - - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - ((dilation[0] * (filter_height - 1) + 1))) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - ((dilation[1] * (filter_width - 1) + 1))) / - stride[1] + - 1, - col_width, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * col_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) - : im_data[im_idx]; - } + if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && + dilation[1] == 1) { + if (padding[0] == 0 && padding[1] == 0) { + im2col_sh1sw1dh1dw1ph0pw0(im, col); + return; + } else if (padding[0] == 1 && padding[1] == 1) { + im2col_sh1sw1dh1dw1ph1pw1(im, col); + return; } + // TODO(TJ): complete padding >=2 } + im2col_common(im, dilation, stride, padding, col); } }; @@ -178,17 +146,6 @@ class Im2ColFunctordims()[0]; int col_width = col->dims()[1]; - PADDLE_ENFORCE_EQ( - (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ( - (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - const T* im_data = im.data(); T* col_data = col->data(); diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index eecb233d22cea06da016b2671fd606b70eddf5a5..4897767f4d88d9e079f05c921153923c4eb354b0 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -77,21 +77,6 @@ class Im2ColFunctordims()[3]; int col_width = col->dims()[4]; - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - (dilation[0] * (filter_height - 1) + 1)) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - (dilation[1] * (filter_width - 1) + 1)) / - stride[1] + - 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - int num_outputs = im_channels * col_height * col_width; int blocks = (num_outputs + 1024 - 1) / 1024; int block_x = 512; @@ -274,21 +259,6 @@ class 
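Note: the im2col.cc change above dispatches to the specialized kernels only for stride 1, dilation 1 and padding 0 or 1, and falls back to im2col_common otherwise. The PADDLE_ENFORCE checks it removes asserted the standard convolution output-size relation, reproduced here for reference (helper name is ours):

    #include <cassert>

    // out = (in + pad_before + pad_after - (dilation * (filter - 1) + 1)) / stride + 1
    int conv_out_size(int in, int filter, int dilation, int pad_before,
                      int pad_after, int stride) {
      return (in + pad_before + pad_after - (dilation * (filter - 1) + 1)) / stride + 1;
    }

    int main() {
      // 224x224 input, 3x3 filter, stride 1, pad 1, dilation 1 -> 224x224 output.
      assert(conv_out_size(224, 3, 1, 1, 1, 1) == 224);
      return 0;
    }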
Im2ColFunctordims()[0]; int col_width = col->dims()[1]; - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - (dilation[0] * (filter_height - 1) + 1)) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - (dilation[1] * (filter_width - 1) + 1)) / - stride[1] + - 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - int block_dim_x = 0; int block_dim_y = 0; if (filter_height <= 4 && filter_width <= 4) { diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6 --- /dev/null +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/** + * The most common im2col algorithm. + * Support dilation, stride and padding. + */ +template +inline void im2col_common(const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 
static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; + for (int oh = 0; oh < output_height; ++oh) { + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = src_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + src_data_ic = src_data_ic + im_size; + } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ +template +inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr int plw = 1; + constexpr int prw = 1; + + const T* im_data = im.data(); + T* col_data = col->data(); + int im_size = im_height * im_width; + int col_matrix_width = output_width * output_height; + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; + } + } + + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data = dst_data_kh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? 
oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + col_matrix_width; + continue; + } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; + src_data = src_data + im_width; + } + } + } + return; + } + + // filter_width != 1 + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (T* dst_data : + {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 8e3f0f286823c383bb0c44d0e7887040ec9b20a0..ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,7 +14,9 @@ limitations under the License. 
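Note: for context on why the col tensor above is laid out as [ic, fh, fw, oh, ow] -- flattened to two dimensions it turns convolution into a single GEMM, with the filter viewed as an (oc) x (ic*fh*fw) matrix. A toy shape calculation of our own, not code from the patch:

    #include <cstdio>

    int main() {
      int ic = 3, ih = 224, iw = 224, fh = 3, fw = 3, oc = 64;
      int oh = ih - fh + 1 + 2;  // stride 1, dilation 1, padding 1 on each side
      int ow = iw - fw + 1 + 2;
      std::printf("col:    %d x %d\n", ic * fh * fw, oh * ow);  // 27 x 50176
      std::printf("filter: %d x %d\n", oc, ic * fh * fw);       // 64 x 27
      std::printf("output: %d x %d\n", oc, oh * ow);            // 64 x 50176
      return 0;
    }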
*/ #include "paddle/fluid/operators/math/im2col.h" #include +#include #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" template void testIm2col() { @@ -167,3 +169,104 @@ TEST(math, im2col) { paddle::platform::CUDAPlace>(); #endif } + +#define PREPARE_IM2COL_CPU \ + paddle::platform::CPUPlace place; \ + paddle::platform::CPUDeviceContext context(place); \ + paddle::framework::Tensor input; \ + paddle::framework::Tensor out; \ + paddle::framework::Tensor ref; \ + std::vector padding({ph, pw}); \ + std::vector stride({1, 1}); \ + std::vector dilation({1, 1}); \ + float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ + for (int i = 0; i < input.numel(); ++i) { \ + input_ptr[i] = static_cast(i + 1); \ + } \ + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ + out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + paddle::operators::math::Im2ColFunctor< \ + paddle::operators::math::ColFormat::kCFO, \ + paddle::platform::CPUDeviceContext, float> \ + im2col + +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + + im2col(context, input, dilation, stride, padding, &out); + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + + float* ref_data = ref.data(); + float* out_data = out.data(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], ref_data[i]); + } +} + +void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + constexpr int repeat = 100; + auto GetCurrentMs = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; + }; + auto t1 = GetCurrentMs(); + for (int i = 0; i < repeat; ++i) { + im2col(context, input, dilation, stride, padding, &out); + } + auto t2 = GetCurrentMs(); + + for (int i = 0; i < repeat; ++i) { + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + } + auto t3 = GetCurrentMs(); + + LOG(INFO) << "before: " << (t3 - t2) / repeat + << ",after: " << (t2 - t1) / repeat + << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; +} + +TEST(math, im2col_cputest) { + // padding_h == padding_w + for (int p = 0; p < 4; ++p) { + // width == height + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p, + /*pw*/ p); + + // height != width + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + + // filter == 1 + testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + } + + // padding_h != padding_w + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 2); + + // benchmark + for (int p : {0, 1}) { + for (int k : {1, 3, 5}) { + LOG(INFO) << "padding == " << p << ", filter == " << k; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, + /*ph*/ p, /*pw*/ p); + } + } +} diff --git 
a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index c3387be6daa3bd34a6e3410ced23fce5d65f2cf7..5923792902a81521256de300f77955f1ea3d16c6 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -41,7 +41,8 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ - template struct Transpose; + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -54,7 +55,7 @@ struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) : tensor_(tensor), value_(value) {} template - void operator()() const { + void apply() const { auto cpu = platform::CPUPlace(); auto* begin = tensor_->mutable_data(cpu); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index d5af718723e8d44da0971ea7756b8c36e771cca2..79b7538ad05b0ff348b8264d50b63211b5254e80 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -33,10 +33,11 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -51,7 +52,7 @@ struct TensorSetConstantGPU { : context_(context), tensor_(tensor), value_(value) {} template - void operator()() const { + void apply() const { SetConstant functor; functor(reinterpret_cast(context_), tensor_, static_cast(value_)); diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 7ec78d9ef8e7ff966674b043c017f2fbedb77bb9..c63ad89e46d2c187c7e6fe6b2fe73fbbed5f4044 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -19,6 +19,10 @@ limitations under the License. 
*/ #ifdef PADDLE_USE_OPENBLAS #include +// remove typedef in openblas +#undef FLOAT +#undef INT +#undef SIZE #endif #include diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index b9bd49d77d935e985705f78402ffe1ea90f24cb3..895a7019aa10e5d9bb8f0c17e433a4344eac3bf4 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -155,7 +155,7 @@ class RowwiseSum { PADDLE_ENFORCE_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), height); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index b545671b43d3a453ab03e4774427179617f62db0..2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) { EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); } +#ifdef PADDLE_WITH_LIBXSMM +template +void MklSmmCompare(int m, int n, int k) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_smm; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CSMM = mat_c_smm.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + T alpha = static_cast(1); + T beta = static_cast(0); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i); + } + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + + auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + const char transa = 'N'; + const char transb = 'N'; + paddle::operators::math::CBlas::SMM_GEMM(&transa, &transb, &n, &m, &k, + &alpha, B, &ldb, A, &lda, &beta, + CSMM, &ldc); + }; -TEST(math_function, gemm_trans_clbas) { + auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, + lda, B, ldb, beta, CMKL, ldc); + }; + + smm(); + mkl(); + ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel()); + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]); + } +} +TEST(math_function, gemm_mkl_vs_smm) { + MklSmmCompare(1, 2, 3); + MklSmmCompare(1, 2, 3); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 8, 5); + MklSmmCompare(3, 8, 5); +} +#endif + +TEST(math_function, gemm_trans_cblas) { paddle::framework::Tensor input1; paddle::framework::Tensor input2; paddle::framework::Tensor input3; @@ -172,3 +228,57 @@ TEST(math_funciton, set_constant) { } delete ctx; } + +template +void GemmWarpTest(int m, int n, int k, T alpha, T beta) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_ref; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CREF = mat_c_ref.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + + ASSERT_EQ(mat_c_mkl.numel(), 
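Note: the RowwiseSum fix above changes the enforce from `size` to `height` because summing each row of an M x N matrix yields M values, one per row. A tiny reference check of our own:

    #include <cassert>
    #include <vector>

    std::vector<float> rowwise_sum(const std::vector<float>& in, int M, int N) {
      std::vector<float> out(M, 0.f);
      for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) out[i] += in[i * N + j];
      return out;
    }

    int main() {
      std::vector<float> m = {1, 2, 3, 4, 5, 6};  // 2 x 3
      auto s = rowwise_sum(m, 2, 3);
      assert(s.size() == 2 && s[0] == 6 && s[1] == 15);
      return 0;
    }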
mat_c_ref.numel()); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i + 1); + } + for (int i = 0; i < mat_c_ref.numel(); ++i) { + CREF[i] = static_cast(i + 2); + CMKL[i] = CREF[i]; + } + + // this would call gemm_warp + paddle::platform::CPUDeviceContext context(*cpu_place); + GetBlas(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, + beta, CREF); + + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, lda, + B, ldb, beta, CMKL, ldc); + + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); + } +} + +TEST(math_function, gemm_warp) { + GemmWarpTest(3, 2, 5, 1.f, 0.f); + GemmWarpTest(3, 2, 5, 2.f, 1.f); + GemmWarpTest(8, 5, 6, 1.f, 0.f); + GemmWarpTest(8, 5, 6, 2.f, 1.f); + GemmWarpTest(3, 2, 5, 1.0, 0.0); + GemmWarpTest(3, 2, 5, 2.0, 1.0); + GemmWarpTest(8, 5, 6, 1.0, 0.0); + GemmWarpTest(8, 5, 6, 2.0, 1.0); +} diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e56e297396c6e37867a53f039478191f0caf08e --- /dev/null +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -0,0 +1,176 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include +namespace paddle { +namespace operators { +namespace math { + +template +void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, + const framework::Tensor& vec) { + SimpleCodeTable code_table(num_classes_); + size_t batch_size = tmat->dims()[0]; + size_t width = tmat->dims()[1]; + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + tmat->data()[i * width + j] += vec.data()[index]; + } + } +} + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, + framework::Tensor* vec) { + SimpleCodeTable code_table(num_classes_); + size_t batch_size = tmat.dims()[0]; + size_t width = tmat.dims()[1]; + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + vec->data()[index] += tmat.data()[i * width + j]; + } + } +} + +template +void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, + framework::Tensor* sum, T scale_sum) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat.dims()[0]; + size_t o_width = tmat.dims()[1]; + for (size_t i = 0; i < num_samples; ++i) { + T sm = static_cast(0.0); + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + // calc_bit starts from right most bit, while data in tmat[i] is in the + // reverse order. + sm += tmat.data()[i * o_width + j]; + } + } + sum->data()[i] = scale_sum * sm; + } +} + +template +void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, + const framework::Tensor& weight, + const framework::Tensor& input) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat->dims()[0]; + size_t tmat_width = tmat->dims()[1]; + size_t input_width = input.dims()[1]; + size_t weight_width = weight.dims()[1]; + auto tmat_value = tmat->data(); + auto weight_value = weight.data(); + auto input_value = input.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + T sum = static_cast(0.0); + for (size_t k = 0; k < input_width; ++k) { + sum += weight_value[weight_width * index + k] * + input_value[input_width * i + k]; + } + tmat_value[i * tmat_width + j] += sum; + } + } +} + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, + framework::Tensor* weight, + const framework::Tensor& input) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat.dims()[0]; + size_t input_width = input.dims()[1]; + size_t tmat_width = tmat.dims()[1]; + size_t weight_width = weight->dims()[1]; + auto tmat_value = tmat.data(); + auto weight_value = weight->data(); + auto input_value = input.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + weight_value[weight_width * index + k] += + tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; + } + } + } +} + +template +void MatrixBitCodeFunctor::MulGradError(const 
framework::Tensor& tmat, + const framework::Tensor& weight, + framework::Tensor* input) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat.dims()[0]; + size_t tmat_width = tmat.dims()[1]; + size_t input_width = input->dims()[1]; + size_t weight_width = weight.dims()[1]; + auto tmat_value = tmat.data(); + auto weight_value = weight.data(); + auto input_value = input->data(); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + input_value[input_width * i + k] += + tmat_value[i * tmat_width + j] * + weight_value[weight_width * index + k]; + } + } + } +} + +template +void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat->dims()[0]; + size_t o_width = tmat->dims()[1]; + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + tmat->data()[i * o_width + j] -= 1; + } + } + } +} + +template class MatrixBitCodeFunctor; +template class MatrixBitCodeFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h new file mode 100644 index 0000000000000000000000000000000000000000..07854c83584f90db02b416b85a4aa61f5cdc0685 --- /dev/null +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +#if defined(_WIN32) +#include +#include +#endif // _WIN32 + +namespace paddle { +namespace operators { +namespace math { +/** + * SimpleCodeTable class should support 3 functions: + * + * size_t size() + * return the number of ids + * + * int get_max_code_length() + * return the maximal code length + * + * SimpleCode operator()(size_t i) + * return the i-th code. Code class is descriebed below. + * + * SimpleCode class should support 3 functions: + * + * int get_length() + * return the length of the code + * + * size_t cal_index(int bit) + * bit ranges from 0 to get_length() - 1 + * return the index for the (1+bit) level parent + * + * bool calc_bit(int bit) + * return true if the bit level parent is the right child of (1+bit) level + * parent + * + */ + +/** + * return the 1-based index of the highest bit set + * + * for x > 0: + * \f[ + * FindLastSet(x) = 1 + \floor*{\log_{2}x} + * \f] + */ +#if !defined(_WIN32) +inline constexpr size_t FindLastSet(size_t x) { + return std::is_same::value + ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) + : (std::is_same::value // NOLINT + ? (x ? 
8 * sizeof(x) - __builtin_clzl(x) : 0) + : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); + +#else +// windows don't have built-in clz, ctz function +template +inline int ctz(const T& value) { + DWORD trailing_zero = 0; + if (_BitScanForward(&trailing_zero, value)) { + return static_cast(trailing_zero); + } else { + return static_cast(0); + } +} + +template +inline int clz(const T& value) { + DWORD leadning_zero = 0; + if (_BitScanReverse(&leadning_zero, value)) { + return static_cast(sizeof(T) * 8 - leadning_zero); + } else { + return static_cast(0); + } +} + +inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } +#endif // !_WIN32 +} + +struct SimpleCode { + SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} + /** + * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * is `c + num_classes` and all siblings can get the same weight indice using + * prefixes. + * Weight index is the prefixes of encoding, thus leave out the right most + * bit in calc_index. + * Binary classification path is the suffixes of encoding, thus leave out the + * left most bit in calc_bit. + */ + inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } + inline bool calc_bit(int bit) const { return c_ & (1 << bit); } + inline int get_length() const { return FindLastSet(c_) - 1; } + + private: + size_t c_; +}; + +struct SimpleCodeTable { + explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {} + SimpleCode operator()(size_t code) const { + return SimpleCode(code, num_classes_); + } + size_t size() const { return num_classes_; } + int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } + + private: + size_t num_classes_; +}; + +template +class MatrixBitCodeFunctor { + public: + explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) + : num_classes_(num_classes), ids_(ids) {} + /* For j < code_length + tmat(i, j) += vec(0, index(i, j)) + */ + void Add(framework::Tensor* tmat, const framework::Tensor& vec); + + /* For j < code_length + vec(0, index(i, j)) += tmat(i, j) + */ + void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); + + /* For j < code_length + sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) + */ + void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum); + + /* For j < code_length + tmat(i, j) -= bit(i, j) + */ + void Sub(framework::Tensor* tmat); + /* For j < code_length + input.row(i) += tmat(i, j) * weight.row(index(i, j)) + */ + void Mul(framework::Tensor* tmat, const framework::Tensor& weight, + const framework::Tensor& input); + + /* For index(i, j) >= 0: + weight.row(index(i, j)) += tmat(i, j) * input.row(i) + */ + void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, + const framework::Tensor& input); + /* For j < code_length + input.row(i) += tmat(i, j) * weight.row(index(i, j)) + */ + void MulGradError(const framework::Tensor& tmat, + const framework::Tensor& weight, framework::Tensor* input); + + size_t num_classes_; + const int64_t* ids_; +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 4166fb54946b7082f5f7dc0e232f636a1d2f8a13..e4d378dc23210e95605c6e09eda8a190cc5c6b4f 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -16,13 +16,12 @@ limitations under the License. 
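The encoding above is easiest to see with a tiny worked example. The following standalone sketch (illustrative only, not part of the patch) mirrors SimpleCode's arithmetic for num_classes = 4: class c is encoded as c + num_classes, calc_index walks the internal tree nodes from the leaf toward the root, calc_bit gives the branch taken at each step, and FindLastSet is replaced by a portable loop.

// Illustrative sketch of the SimpleCode arithmetic, e.g. class 1 -> code 5
// (binary 101): path length 2, node indices {1, 0}, branch bits {1, 0}.
#include <cstdio>
#include <cstddef>

static size_t CalcIndex(size_t c, int bit) { return (c >> (bit + 1)) - 1; }
static bool CalcBit(size_t c, int bit) { return c & (1UL << bit); }
static int GetLength(size_t c) {
  int len = 0;
  while (c) { ++len; c >>= 1; }  // portable stand-in for FindLastSet
  return len - 1;
}

int main() {
  const size_t num_classes = 4;
  for (size_t cls = 0; cls < num_classes; ++cls) {
    size_t code = cls + num_classes;  // root has id 1, leaves start at num_classes
    int length = GetLength(code);
    std::printf("class %zu -> code %zu, path length %d:", cls, code, length);
    for (int j = 0; j < length; ++j) {
      std::printf(" (node %zu, bit %d)", CalcIndex(code, j), CalcBit(code, j) ? 1 : 0);
    }
    std::printf("\n");
  }
  return 0;
}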
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { namespace math { -#define FLT_MAX __FLT_MAX__ - template class MaxOutFunctor { public: diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h new file mode 100644 index 0000000000000000000000000000000000000000..3ae25eae98b25bca015ec4383c7126eb81e52b8a --- /dev/null +++ b/paddle/fluid/operators/math/padding.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenTensor = framework::EigenTensor; + +template +void PadFunction(const framework::ExecutionContext& context, + const std::vector& pads, const framework::Tensor& src, + T pad_value, framework::Tensor* out) { + Eigen::array, D> paddings; + + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = pads[i * 2]; + paddings[i].second = pads[i * 2 + 1]; + } + + auto src_tensor = EigenTensor::From(src); + auto out_tensor = EigenTensor::From(*out); + + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = src_tensor.pad(paddings, pad_value); +} + +template +void PadGradFunction(const framework::ExecutionContext& context, + const std::vector& pads, const framework::Tensor& src, + framework::Tensor* d_out) { + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = -pads[i * 2]; + paddings[i].second = -pads[i * 2 + 1]; + } + + auto d_out_tensor = EigenTensor::From(*d_out); + auto src_tensor = EigenTensor::From(src); + auto& place = + *context.template device_context().eigen_device(); + d_out_tensor.device(place) = src_tensor.pad(paddings, 0); +} + +template +void PaddingFunctor(int rank, const framework::ExecutionContext& context, + const std::vector& pads, T pad_value, + const framework::Tensor& src, framework::Tensor* out) { + switch (rank) { + case 1: + PadFunction(context, pads, src, pad_value, out); + break; + case 2: + PadFunction(context, pads, src, pad_value, out); + break; + case 3: + PadFunction(context, pads, src, pad_value, out); + break; + case 4: + PadFunction(context, pads, src, pad_value, out); + break; + case 5: + PadFunction(context, pads, src, pad_value, out); + break; + case 6: + PadFunction(context, pads, src, pad_value, out); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } +} + +template +void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, + const std::vector& pads, + const framework::Tensor& src, framework::Tensor* out) { + switch (rank) { + case 1: + PadGradFunction(context, pads, src, out); + break; + case 2: + PadGradFunction(context, pads, src, out); + break; + case 
3: + PadGradFunction(context, pads, src, out); + break; + case 4: + PadGradFunction(context, pads, src, out); + break; + case 5: + PadGradFunction(context, pads, src, out); + break; + case 6: + PadGradFunction(context, pads, src, out); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 2538d739cce95d1b2fc5b3f905af5e6d94cf7af5..120f5919803806e0d3b7dc8eaf530ae89819b84d 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -18,15 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { namespace math { -#define FLT_MAX \ - __FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm - // still wondering where to put it. - /* * \brief Extracting simple operations from pooling. * Both MaxPool and AvgPool need "initial", "compute" and "finalize" diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index b546b8728217ed6013247555dcd5d7180ddeae74..e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor { auto width = dst_dims[1]; auto* src_data = src.data(); auto* dst_data = dst->data(); - for (int i = 0; i < height; ++i) { - if (is_src_index) { - memcpy(dst_data + i * width, src_data + index[i] * width, - width * sizeof(T)); - } else { - memcpy(dst_data + index[i] * width, src_data + i * width, - width * sizeof(T)); + const int sz = width * sizeof(T); + if (is_src_index) { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + i * width, src_data + index[i] * width, sz); + } + } else { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + index[i] * width, src_data + i * width, sz); } } } diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h index 62e6307ae9f4236a38c49daaf09fc05c54268159..a3186f82d0c0cc6c9585735ddf7e9bb4db7126cb 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor { auto lods = lod_tensor.lod(); PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); - auto lod = lods[0]; + const auto& lod = lods[0]; std::vector seq_info; for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { @@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor { // Calculate the start position of each batch. // example: sequences = {s0, s1, s2} // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // num_batch = 5, + // max_seqlen = 5, // batchIndex = {b0, b1, b2, b3, b4} // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} @@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor { // where 1 is the second sequence, // 0 is the first sequence, // 2 is the third sequence. - // The num_batch represents batch size after rearranging the + // The max_seqlen represents batch size after rearranging the // input LodTensor. It is also the maximum length of input sequence. 
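The rearrangement sketched in the comments above can be reproduced with a few standalone lines. The snippet below (illustrative only, not part of the patch) uses the same example, LoD offsets {0, 4, 9, 12}, i.e. sequence lengths 4, 5 and 3, and prints the resulting batch start positions and seq2batch index.

// Expected output:
//   batch_starts:  0 3 6 9 11 12
//   seq2batch_idx: 4 0 9 5 1 10 6 2 11 7 3 8
#include <algorithm>
#include <cstdio>
#include <vector>

struct SeqInfo { size_t start, length, seq_idx; };

int main() {
  std::vector<size_t> lod = {0, 4, 9, 12};
  std::vector<SeqInfo> seq_info;
  for (size_t i = 0; i + 1 < lod.size(); ++i)
    seq_info.push_back({lod[i], lod[i + 1] - lod[i], i});
  // Longest sequence first, as LoDTensor2BatchFunctor does.
  std::sort(seq_info.begin(), seq_info.end(),
            [](const SeqInfo& a, const SeqInfo& b) { return a.length > b.length; });

  size_t max_seqlen = seq_info[0].length;
  std::vector<size_t> batch_starts = {0};
  std::vector<size_t> seq2batch_idx(lod.back());
  for (size_t n = 0; n < max_seqlen; ++n) {
    size_t batch_id = batch_starts.back();
    for (const auto& info : seq_info)
      if (n < info.length) seq2batch_idx[batch_id++] = info.start + n;
    batch_starts.push_back(batch_id);
  }

  std::printf("batch_starts: ");
  for (size_t s : batch_starts) std::printf("%zu ", s);
  std::printf("\nseq2batch_idx: ");
  for (size_t idx : seq2batch_idx) std::printf("%zu ", idx);
  std::printf("\n");
  return 0;
}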
paddle::framework::LoD batch_lods; @@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor { batch_lods.emplace_back(std::vector{0}); // batch_lods[0] is the start positions for batch LoDTensor - int num_batch = seq_info[0].length; - batch_lods[0].resize(static_cast(num_batch + 1)); + int max_seqlen = seq_info[0].length; + batch_lods[0].resize(static_cast(max_seqlen + 1)); // batch_lods[1] is the raw index in the input LoDTensor batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); // batch_lods[2] is the sort order for the input LoDTensor. @@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor { size_t* batch_starts = batch_lods[0].data(); size_t* seq2batch_idx = batch_lods[1].data(); batch_starts[0] = 0; - for (int n = 0; n < num_batch; n++) { + for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); for (size_t i = 0; i < seq_info.size(); ++i) { int seq_len = seq_info[i].length; diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index d63c6c4ed55331235188c1c750468d4e75b9b7f2..25f06a25a0638cbb394df58d35f88307941d117f 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -18,65 +18,86 @@ namespace paddle { namespace operators { namespace math { +template +void CopyValidData(framework::Tensor* dst_tensor, + const framework::Tensor* src_tensor, + const framework::Vector& seq_offsets, + int pad_seq_len, int step_width, bool norm_by_len, + CopyType type, PadLayout layout) { + int seq_num = seq_offsets.size() - 1; + const T* src_data = src_tensor->data(); + T* dst_data = dst_tensor->data(); + + int seq_cpy_gap = step_width; + int pad_cpy_gap = + layout == kBatchLengthWidth ? step_width : seq_num * step_width; + for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { + int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + PADDLE_ENFORCE_GE( + pad_seq_len, valid_seq_len, + "The padded sequence length can not be less than its original length."); + int seq_data_offset = seq_offsets[seq_idx] * step_width; + int pad_data_offset = layout == kBatchLengthWidth + ? seq_idx * pad_seq_len * step_width + : seq_idx * step_width; + float scale = 1.0f / static_cast(valid_seq_len); + + for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) { + const T* src = + src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset); + T* dst = + dst_data + (type == kSeqToPad ? 
pad_data_offset : seq_data_offset); + memcpy(dst, src, step_width * sizeof(T)); + if (norm_by_len) { + for (int i = 0; i < step_width; ++i) { + *(dst + i) *= scale; + } + } + seq_data_offset += seq_cpy_gap; + pad_data_offset += pad_cpy_gap; + } + } +} + template class PaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor* padding, - bool norm_by_times) { - auto lod = seq.lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The LoD of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq.dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding->dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequence_length, num_sequences, sequence_width]."); - - const int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be the " - "maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be the " - "number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq.numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - const T* seq_data = seq.data(); - T* padding_data = padding->data(); - for (int64_t i = 0; i < max_sequence_length; ++i) { - for (int64_t j = 0; j < num_sequences; ++j) { - int64_t start_pos = abs_offset_lod[level][j]; - int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; - if (i < sequence_length) { - // i > 0 => sequence_length > 0 - T scale = - norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; - for (int64_t k = 0; k < sequence_width; ++k) { - padding_data[(i * num_sequences + j) * sequence_width + k] = - seq_data[(start_pos + i) * sequence_width + k] * scale; - } - } else { - memset(padding_data + (i * num_sequences + j) * sequence_width, 0, - sequence_width * sizeof(T)); - } + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_lod = seq_tensor.lod(); + const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + const auto& seq_tensor_dims = seq_tensor.dims(); + const auto& pad_tensor_dims = pad_tensor->dims(); + if (pad_seq_len == -1) { + pad_seq_len = MaximumSequenceLength(seq_offsets); + } + int step_width = seq_tensor.numel() / seq_tensor_dims[0]; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."); + + // fill padding value + T* pad_data = pad_tensor->data(); + const T* pad_value_data = pad_value.data(); + if (pad_value.numel() == 1) { + for (int i = 0; i < pad_tensor->numel(); ++i) { + pad_data[i] = *pad_value_data; + } + } else { + for (int i = 0; i < pad_tensor->numel(); i += step_width) { + memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); } } + + CopyValidData(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len, + step_width, norm_by_times, kSeqToPad, layout); } }; @@ -84,62 +105,35 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - framework::LoDTensor* seq, const framework::Tensor& padding, - bool norm_by_times) { - auto lod = seq->lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The LoD of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq->dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding.dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequnece_length, num_sequences, sequence_width]."); - - const int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be " - "the maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be " - "the number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq->numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - const T* padding_data = padding.data(); - T* seq_data = seq->data(); - for (int64_t i = 0; i < num_sequences; ++i) { - int64_t start_pos = abs_offset_lod[level][i]; - int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; - for (int64_t j = 0; j < sequence_length; ++j) { - // sequence_width > j > 0 - T scale = - norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; - for (int64_t k = 0; k < sequence_width; ++k) { - seq_data[(start_pos + j) * sequence_width + k] = - padding_data[(j * num_sequences + i) * sequence_width + k] * - scale; - } - } + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; + const auto& seq_tensor_dims = seq_tensor->dims(); + const auto& pad_tensor_dims = pad_tensor.dims(); + if (pad_seq_len == -1) { + pad_seq_len = MaximumSequenceLength(seq_offsets); } + int step_width = seq_tensor->numel() / seq_tensor_dims[0]; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + + CopyValidData(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len, + step_width, norm_by_times, kPadToSeq, layout); } }; +template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; + +template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 0956a0c17d387f4a174c7ed4e9b1b1f816dcf4ae..035e10dcbe4e2083723e47d7dda75ce267a9f141 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -19,41 +19,32 @@ namespace paddle { namespace operators { namespace math { -template -__global__ void SequencePaddingKernel(T* padding, T* sequence, - const size_t* sequence_start_positions, - const size_t sequence_width, - const size_t max_sequence_length, - const size_t num_sequences) { - size_t padding_idx = blockIdx.y; - size_t start_pos = sequence_start_positions[padding_idx]; - size_t sequence_length = - sequence_start_positions[padding_idx + 1] - start_pos; - - size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; - size_t padding_base_idx = - (sequence_idx * num_sequences + padding_idx) * sequence_width; - size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; - - if (sequence_idx < sequence_length) { - T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; - if (Padding) { - /* sequence -> padding */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; - } - } else { - /* padding -> sequence */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; - } +template +__global__ void SequencePaddingKernel( + T* dst, const T* src, const T* pad_value, bool is_constant_pad, + const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len, + const size_t step_width, bool norm_by_len, const PadLayout layout) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = layout == kBatchLengthWidth + ? 
(seq_idx * pad_seq_len + step_idx) * step_width + : (step_idx * seq_num + seq_idx) * step_width; + + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + float scale = norm_by_len ? (1.0f / static_cast(seq_len)) : 1.0f; + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = scale * src_data[i]; } - } else if (sequence_idx < max_sequence_length) { - if (Padding) { - /* sequence -> padding */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - padding[padding_base_idx + i] = 0; - } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i]; } } } @@ -62,74 +53,59 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor* padding, - bool norm_by_times) { - auto lod = seq.lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The lod of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq.dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding->dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequence_length, num_sequences, sequence_width]."); - - int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be the " - "maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be the " - "number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq.numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(seq, context.GetPlace(), context, padding); - padding->Resize(padding_dims); + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_lod = seq_tensor.lod(); + const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + const auto& seq_tensor_dims = seq_tensor.dims(); + const auto& pad_tensor_dims = pad_tensor->dims(); + int max_seq_len = MaximumSequenceLength(seq_offsets); + if (pad_seq_len == -1) { + pad_seq_len = max_seq_len; + } + PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len, + "The pad_seq_len must be equal to or greater than the " + "original max sequence length."); + int step_width = seq_tensor.numel() / seq_tensor_dims[0]; + int seq_num = seq_offsets.size() - 1; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, + "The numel of 'pad_value' can only be 1 or be equal to 
the " + "'step_width'."); + + if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { + TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor); + pad_tensor->Resize(pad_tensor_dims); return; } - const int64_t kBlockSize = 512; + const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, * and at least 8 elements for each thread. */ size_t block_dim_x = - std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); size_t block_dim_y = kBlockSize / block_dim_x; dim3 threads(block_dim_x, block_dim_y); - size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = num_sequences; + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; dim3 grid(grid_dim_x, grid_dim_y); - const T* seq_data = seq.data(); - T* padding_data = padding->data(); - if (norm_by_times) { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } else { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } + const T* seq_data = seq_tensor.data(); + T* pad_data = pad_tensor->data(); + const T* pad_value_data = pad_value.data(); + + SequencePaddingKernel<<>>( + pad_data, seq_data, pad_value_data, pad_value.numel() == 1, + seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + step_width, norm_by_times, layout); } }; @@ -137,79 +113,62 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - framework::LoDTensor* seq, const framework::Tensor& padding, - bool norm_by_times) { - auto lod = seq->lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The lod of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq->dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding.dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequnece_length, num_sequences, sequence_width]."); - - int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be " - "the maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be " - "the number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq->numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(padding, context.GetPlace(), context, seq); - seq->Resize(seq_dims); + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_offsets = 
framework::ToAbsOffset(seq_tensor->lod())[lod_level]; + const auto& seq_tensor_dims = seq_tensor->dims(); + const auto& pad_tensor_dims = pad_tensor.dims(); + int max_seq_len = MaximumSequenceLength(seq_offsets); + if (pad_seq_len == -1) { + pad_seq_len = max_seq_len; + } + int step_width = seq_tensor->numel() / seq_tensor_dims[0]; + int seq_num = seq_offsets.size() - 1; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + + if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { + TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); + seq_tensor->Resize(seq_tensor_dims); return; } - const int64_t kBlockSize = 512; + const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, * and at least 8 elements for each thread. */ size_t block_dim_x = - std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); size_t block_dim_y = kBlockSize / block_dim_x; dim3 threads(block_dim_x, block_dim_y); - size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = num_sequences; + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; dim3 grid(grid_dim_x, grid_dim_y); - const T* padding_data = padding.data(); - T* seq_data = seq->data(); - if (norm_by_times) { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } else { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } + const T* pad_data = pad_tensor.data(); + T* seq_data = seq_tensor->data(); + + SequencePaddingKernel<<>>( + seq_data, pad_data, nullptr, false, + seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + step_width, norm_by_times, layout); } }; +template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; + +template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index b56e6db1ebdac1a00561c07845c03bb8fbd8d35a..e752aa58979dddba4d010071d2c4b5dc3e0c6756 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -15,6 +15,7 @@ limitations under the License. 
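Both CUDA functors pick their launch configuration with the same bit trick: roughly one thread per 8 elements of step_width, rounded up to a warp multiple, never below 32 and never above kBlockSize = 512, with block_dim_y filling the rest of the block. A small standalone sketch (illustrative only, not part of the patch) of that computation:

#include <algorithm>
#include <cstdio>

int main() {
  const int kBlockSize = 512;
  for (int step_width : {10, 100, 3000}) {
    int block_dim_x =
        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
    int block_dim_y = kBlockSize / block_dim_x;
    // Prints (32, 16), (32, 16) and (384, 1) for the three widths.
    std::printf("step_width=%d -> threads (%d, %d)\n", step_width, block_dim_x,
                block_dim_y);
  }
  return 0;
}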
*/ #pragma once #include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -22,17 +23,33 @@ namespace paddle { namespace operators { namespace math { -inline static size_t MaximumSequenceLength(const framework::LoD& lod, - const size_t level) { - const size_t num_sequences = lod[level].size() - 1; - size_t max_sequence_length = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - for (size_t i = 0; i < num_sequences; ++i) { - max_sequence_length = - std::max(max_sequence_length, - abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); +enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; + +enum CopyType { kSeqToPad, kPadToSeq }; + +inline static size_t MaximumSequenceLength( + const framework::Vector& seq_offset) { + size_t seq_num = seq_offset.size() - 1; + size_t max_seq_len = 0; + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } - return max_sequence_length; + return max_seq_len; +} + +inline static void CheckDims(const framework::DDim& seq_tensor_dims, + const framework::DDim& pad_tensor_dims, + const framework::Vector& seq_offset, + int64_t padded_seq_len, int64_t step_width, + const PadLayout& layout) { + PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back(), + "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."); + + PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size(), + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."); } /* @@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod, template class PaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, const framework::LoDTensor& seq, - framework::Tensor* padding, bool norm_by_times); + void operator()(const DeviceContext& context, + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth); }; template class UnpaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, framework::LoDTensor* seq, - const framework::Tensor& padding, bool norm_by_times); + void operator()(const DeviceContext& context, + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth); }; } // namespace math diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index b0c201db0ccbe81d8f57cd984d2cdfd2f6a48f25..4f61b1029c65aedaf4fce771866964fe1d0d6112 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod, paddle::framework::LoDTensor cpu_seq_back; paddle::framework::LoDTensor seq; paddle::framework::LoDTensor seq_back; - paddle::framework::Tensor padding; + paddle::framework::LoDTensor padding; + paddle::framework::LoDTensor cpu_pad_value; + paddle::framework::LoDTensor pad_value; const size_t level = lod.size() - 1; auto seq_dims = @@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod, } const size_t 
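The two PadLayout values only differ in where the sequence index and the step index land in the padded tensor: kBatchLengthWidth stores [seq_num, pad_seq_len, step_width], kLengthBatchWidth stores [pad_seq_len, seq_num, step_width]. A minimal standalone sketch (illustrative only, not part of the patch, step_width = 1 and pad value 0) of the seq-to-pad copy under both layouts:

#include <cstdio>
#include <vector>

enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };

std::vector<float> SeqToPad(const std::vector<float>& seq,
                            const std::vector<size_t>& offsets,
                            size_t pad_seq_len, PadLayout layout) {
  size_t seq_num = offsets.size() - 1;
  std::vector<float> pad(seq_num * pad_seq_len, 0.f);  // pad value 0
  for (size_t i = 0; i < seq_num; ++i) {
    for (size_t s = 0; s + offsets[i] < offsets[i + 1]; ++s) {
      size_t dst = layout == kBatchLengthWidth ? i * pad_seq_len + s
                                               : s * seq_num + i;
      pad[dst] = seq[offsets[i] + s];
    }
  }
  return pad;
}

int main() {
  std::vector<float> seq = {1, 2, 3, 4, 5};   // two sequences: [1 2] and [3 4 5]
  std::vector<size_t> offsets = {0, 2, 5};
  for (PadLayout layout : {kBatchLengthWidth, kLengthBatchWidth}) {
    // Prints "1 2 0 3 4 5" for kBatchLengthWidth, "1 3 2 4 0 5" for kLengthBatchWidth.
    for (float v : SeqToPad(seq, offsets, 3, layout)) std::printf("%.0f ", v);
    std::printf("\n");
  }
  return 0;
}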
max_sequence_length = - paddle::operators::math::MaximumSequenceLength(lod, level); + paddle::operators::math::MaximumSequenceLength(lod[level]); const size_t num_sequences = lod[level].size() - 1; auto padding_dims = paddle::framework::make_ddim({static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + + T* pad_value_data = + cpu_pad_value.mutable_data({1}, paddle::platform::CPUPlace()); + *pad_value_data = static_cast(0); + if (paddle::platform::is_cpu_place(*place)) { + pad_value = cpu_pad_value; + } else { + TensorCopySync(cpu_pad_value, *place, &pad_value); + } + paddle::operators::math::PaddingLoDTensorFunctor()( - *context, seq, &padding, false); + *context, seq, &padding, pad_value, -1, 0, false, + paddle::operators::math::kLengthBatchWidth); seq_back.set_lod(lod); seq_back.mutable_data(seq_dims, *place); paddle::operators::math::UnpaddingLoDTensorFunctor()( - *context, &seq_back, padding, false); + *context, padding, &seq_back, -1, 0, false, + paddle::operators::math::kLengthBatchWidth); if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index f25d3d3f1ee1f89d46b8e7c88ca68048f5203544..69318a6598c8c69eceab7216df6382537153d34f 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -103,6 +103,58 @@ class MaxSeqPoolGradFunctor { } }; +template +class LastSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, + framework::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + // Point to the begin of next sequence + in_data += seq_len * item_size; + // Copy the last item of sequence to output + std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); + out_data += item_size; + } + } +}; + +template +class FirstSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, + framework::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + // Copy the first item of sequence to output + std::memcpy(out_data, in_data, item_size * sizeof(T)); + // Point to the next sequence + in_data += seq_len * item_size; + out_data += item_size; + } + } +}; + template class SequencePoolFunctor { public: @@ -116,6 +168,16 @@ class SequencePoolFunctor { max_pool(context, input, output, index); return; } + if (pooltype == "LAST") { + math::LastSeqPoolFunctor last_pool; + last_pool(context, input, output); + return; + } + if (pooltype == "FIRST") { + math::FirstSeqPoolFunctor first_pool; + first_pool(context, input, 
output); + return; + } auto lod = input.lod()[0]; auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { @@ -133,10 +195,6 @@ class SequencePoolFunctor { } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); - } else if (pooltype == "LAST") { - out_e.device(place) = in_e.chip(h - 1, 0); - } else if (pooltype == "FIRST") { - out_e.device(place) = in_e.chip(0, 0); } else { PADDLE_THROW("unsupported pooling pooltype"); } diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index a579182ec1bd5d10d95bbf8c6f5a0e70ceaaaf4b..3effe776258cb541dbba32f63eda457d917011f4 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index a16861b3b77fc980ab932b9d88859b38ec36108b..2dc1467b0d4816d5cc0535eb62e936cf342a241c 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase { scope.FindVar(Output("Out"))->GetMutable(); auto level = static_cast(Attr("level")); - auto &mask_dim = mask.dims(); + PADDLE_ENFORCE(in_true.numel() || in_false.numel(), + "Input(InTrue) or Input(InFalse) should be initialized."); + auto &mask_dim = mask.dims(); std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); @@ -59,19 +61,27 @@ class MergeLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - int rank = in_true.dims().size(); - platform::Place place = in_true.place(); - std::type_index data_type = in_true.type(); - framework::DDim in_true_dims = - framework::slice_ddim(in_true.dims(), 1, rank); - + platform::Place place = dev_place; int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; - auto in_true_dim_vec = framework::vectorize(in_true_dims); - in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size); + std::type_index data_type = + in_true.IsInitialized() ? 
in_true.type() : in_false.type(); + int rank; + framework::DDim in_dims; + if (in_true.IsInitialized()) { + rank = in_true.dims().size(); + in_dims = framework::slice_ddim(in_true.dims(), 1, rank); + } else { + rank = in_false.dims().size(); + in_dims = framework::slice_ddim(in_false.dims(), 1, rank); + } + + auto in_dim_vec = framework::vectorize(in_dims); + in_dim_vec.insert(in_dim_vec.begin(), batch_size); - framework::DDim out_dims = framework::make_ddim(in_true_dim_vec); + framework::DDim out_dims = framework::make_ddim(in_dim_vec); out->Resize(out_dims); + out->mutable_data(place, data_type); auto *out_lod = out->mutable_lod(); diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index dcd73e3c3e40f80e07b73944d1f0cc57fea010d3..5f43c5810812260c4384349bdb709716c9a182f5 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -98,7 +98,7 @@ The update equations are as follows: $$ velocity = mu * velocity + gradient \\ if (use\_nesterov): \\ - param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ + param = param - (gradient + mu * velocity) * learning\_rate \\ else: \\ param = param - learning\_rate * velocity. \\ $$ diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index 5eb9d9950248bb50bb823f071c7fff0ddcc47234..a3932db1f3a50305d585cd3d5e86fa1b527df78b 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v, T g_val = g[i]; T v_new = v[i] * mu + g_val; v_out[i] = v_new; - p_out[i] = p[i] - (g_val - v_new * mu) * lr; + p_out[i] = p[i] - (g_val + v_new * mu) * lr; } } else { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 04a1929b84a93af6465bacfe7974a1530296946d..264726040fb566a52b8c0cdee0a1524197d2a675 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel { v_out = v * mu + g; if (use_nesterov) { - p_out = p - (g - v_out * mu) * lr[0]; + p_out = p - (g + v_out * mu) * lr[0]; } else { p_out = p - lr[0] * v_out; } diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 51993398bd3427e1f0da155918395bc50fa65e45..2a8e4af516ce9341772d4668dc993215b4aae24d 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -54,9 +54,9 @@ class MulOp : public framework::OperatorWithKernel { auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); - PADDLE_ENFORCE_EQ( - x_mat_dims[1], y_mat_dims[0], - "First matrix's width must be equal with second matrix's height."); + PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0], + "First matrix's width must be equal with second matrix's " + "height. 
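The corrected Nesterov formula in the momentum_op changes above can be checked with a scalar example. The sketch below (illustrative values only, not part of the patch) applies one update step with both variants:

#include <cstdio>

int main() {
  const float lr = 0.1f, mu = 0.9f;
  float param = 1.0f, velocity = 0.5f, grad = 0.2f;

  float v_new = mu * velocity + grad;                   // 0.65
  float p_plain = param - lr * v_new;                   // 1.0 - 0.065  = 0.935
  float p_nesterov = param - (grad + mu * v_new) * lr;  // 1.0 - 0.0785 = 0.9215

  std::printf("velocity=%g plain=%g nesterov=%g\n", v_new, p_plain, p_nesterov);
  return 0;
}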
%s, %s"); std::vector output_dims; output_dims.reserve( static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 15dd975e3bbf80b2e616e6628555e812d025f70a..f72824806ed6ee3a4490938403d441326f8a3d4a 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -62,23 +62,31 @@ class MulGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { int x_num_col_dims = ctx.template Attr("x_num_col_dims"); int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : *x; - const Tensor y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : *y; - const Tensor* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + auto y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + auto* dout = ctx.Input(framework::GradVarName("Out")); Tensor dout_mat; dout_mat.ShareDataWith(*dout); dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); - Tensor* dx = ctx.Output(framework::GradVarName("X")); - Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + if (dx != nullptr) { + dx->set_lod(x->lod()); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + } + auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); if (dx) { diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index ce0ddd89bfb0d73e237a6f9a777376624d8ef2d4..cdcba8035762d8f442eb8b8ed52a4e3e99ac31b6 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,3 +1,3 @@ -if(WITH_GPU) +if(WITH_GPU AND NOT WIN32) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) endif() diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu index 1d0021d33ff9ee65c3366183466b94266e6c2999..67449aa4c67bee6606928ef3a2d986a1bdec038f 100644 --- a/paddle/fluid/operators/norm_op.cu +++ b/paddle/fluid/operators/norm_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,14 +11,151 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#define EIGEN_USE_GPU +#include +#include "cub/cub.cuh" #include "paddle/fluid/operators/norm_op.h" +namespace paddle { +namespace operators { + +__device__ __forceinline__ float square_root(float x) { return sqrtf(x); } + +__device__ __forceinline__ double square_root(double x) { return sqrt(x); } + +template +__global__ void Normalize(const T* x, const int pre, + const int axis_n, // dim in axis + const int post, const T eps, T* y, T* out_norm) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + int base = (i / post) * post * axis_n + (i % post); + + T sum = 0.0; + __shared__ T norm; + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + const T x_ij = x[base + j * post]; + sum += x_ij * x_ij; + } + T reduce_result = BlockReduce(temp_storage).Sum(sum); + + if (threadIdx.x == 0) { + norm = square_root(reduce_result + eps); + out_norm[i] = norm; + } + __syncthreads(); + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + const int index = base + j * post; + y[index] = x[index] / norm; + } + } +} + +template +class NormCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* out_y = ctx.Output("Out"); + auto* out_norm = ctx.Output("Norm"); + const T* x = in_x->data(); + T* y = out_y->mutable_data(ctx.GetPlace()); + T* norm = out_norm->mutable_data(ctx.GetPlace()); + + auto xdim = in_x->dims(); + auto ndim = out_norm->dims(); + int axis = ctx.Attr("axis"); + T eps = static_cast(ctx.Attr("epsilon")); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto& dev_ctx = ctx.cuda_device_context(); + + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid = std::min(max_blocks, pre * post); + Normalize<<>>(x, pre, n, post, + eps, y, norm); + } +}; + +template +__global__ void NormalizeGradient(const T* x, const T* x_norm, const T* y_grad, + const int pre, const int axis_n, + const int post, T* x_grad) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage_sum; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + T sum = 0.0; + __shared__ T row_sum; + __shared__ T row_sqrt_norm; + __shared__ T row_norm; + + auto base = (i / post) * post * axis_n + (i % post); + + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + int index = base + j * post; + sum += x[index] * y_grad[index]; + } + T reduce_result = BlockReduce(temp_storage_sum).Sum(sum); + + if (threadIdx.x == 0) { + row_sum = reduce_result; + row_sqrt_norm = x_norm[i]; + row_norm = row_sqrt_norm * row_sqrt_norm; + } + __syncthreads(); + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + int index = base + j * post; + const T x_ij = x[index]; + const T dy_ij = y_grad[index]; + x_grad[index] = (dy_ij - x_ij * row_sum / row_norm) / row_sqrt_norm; + } + } +} + +template +class NormGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* in_norm = ctx.Input("Norm"); + auto* in_dy = ctx.Input(framework::GradVarName("Out")); + auto* out_dx = ctx.Output(framework::GradVarName("X")); + T* dx = out_dx->mutable_data(ctx.GetPlace()); + const T* x = in_x->data(); + 
const T* x_norm = in_norm->data(); + const T* dy = in_dy->data(); + + auto xdim = in_x->dims(); + int axis = ctx.Attr("axis"); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto& dev_ctx = ctx.cuda_device_context(); + + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid = std::min(max_blocks, pre * post); + NormalizeGradient<<>>( + x, x_norm, dy, pre, n, post, dx); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel, - ops::NormKernel); -REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel, - ops::NormGradKernel); +REGISTER_OP_CUDA_KERNEL(norm, ops::NormCUDAKernel, + ops::NormCUDAKernel); +REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradCUDAKernel, + ops::NormGradCUDAKernel); diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 3167bdc8ac718b23435690577e4163826d14a332..d0224177ecf7f0c918def08ff4dd6a3c8eb349d8 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -65,14 +65,17 @@ class NormKernel : public framework::OpKernel { Eigen::DSizes rdim(1); // y = x / sqrt((sum(x * x) + epsilon)) // norm = sqrt(sum(x * x) + epsilon) - auto sum = x.pow(2).sum(rdim) + eps; + auto x2 = x * x; + auto sum = x2.sum(rdim) + eps; norm.device(*place) = sum.sqrt(); + // y = x / norm Eigen::DSizes rshape(pre, 1, post); Eigen::DSizes bcast(1, n, 1); y.device(*place) = x / norm.reshape(rshape).broadcast(bcast); } }; + template class NormGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 625065692c1f32c89d9e566d00051e237ac9a3af..59d8b9b8a8d554eb16826712ff634eed5df2d648 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -41,7 +41,7 @@ struct OneHotOpCUDAFunctor { : in_(in), out_(out), depth_(depth), ctx_(ctx) {} template - void operator()() const { + void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index 7e77f25089c4bd0297b0eb5a0ed7555cc0af5a9f..1ebd2676496940ff8f90caaaded5c8227bd7ae78 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -31,7 +31,7 @@ struct OneHotOpFunctor { : in_(in), out_(out), depth_(depth), ctx_(ctx) {} template - void operator()() const { + void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a706d05fd7c35ef993f5199f0f893622cb863c5d --- /dev/null +++ b/paddle/fluid/operators/pad2d_op.cc @@ -0,0 +1,584 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
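For reference, the normalization computed by the norm_op kernels above can be written as a plain CPU sketch (illustrative only, not part of the patch): a tensor is viewed as [pre, n, post], each slice along the middle (axis) dimension is divided by its L2 norm, and the gradient follows dx = (dy - x * <x, dy> / norm^2) / norm.

#include <cmath>
#include <cstdio>
#include <vector>

void NormForward(const std::vector<float>& x, int pre, int n, int post,
                 float eps, std::vector<float>* y, std::vector<float>* norm) {
  y->assign(x.size(), 0.f);
  norm->assign(pre * post, 0.f);
  for (int i = 0; i < pre * post; ++i) {
    int base = (i / post) * post * n + (i % post);  // same indexing as the kernels
    float sum = 0.f;
    for (int j = 0; j < n; ++j) sum += x[base + j * post] * x[base + j * post];
    float nrm = std::sqrt(sum + eps);
    (*norm)[i] = nrm;
    for (int j = 0; j < n; ++j) (*y)[base + j * post] = x[base + j * post] / nrm;
  }
}

void NormBackward(const std::vector<float>& x, const std::vector<float>& norm,
                  const std::vector<float>& dy, int pre, int n, int post,
                  std::vector<float>* dx) {
  dx->assign(x.size(), 0.f);
  for (int i = 0; i < pre * post; ++i) {
    int base = (i / post) * post * n + (i % post);
    float dot = 0.f;
    for (int j = 0; j < n; ++j) dot += x[base + j * post] * dy[base + j * post];
    for (int j = 0; j < n; ++j) {
      int k = base + j * post;
      (*dx)[k] = (dy[k] - x[k] * dot / (norm[i] * norm[i])) / norm[i];
    }
  }
}

int main() {
  std::vector<float> x = {3, 4, 0, 5};  // shape [2, 2, 1]: pre = 2, n = 2, post = 1
  std::vector<float> y, norm;
  NormForward(x, 2, 2, 1, 0.f, &y, &norm);
  // Prints norms 5 and 5, y = 0.6 0.8 0 1.
  std::printf("norms: %g %g  y: %g %g %g %g\n", norm[0], norm[1], y[0], y[1],
              y[2], y[3]);
  return 0;
}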
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +void Pad2DConstNCHW(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[out_h * out_width + out_w] = + (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) + ? value + : in_data[in_h * in_width + in_w]; + } + } + in_data += in_height * in_width; + out_data += out_height * out_width; + } + } +} + +template +void Pad2DConstNHWC(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = (out_h * out_width + out_w) * channels; + if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) { + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = value; + } + } else { + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } + } + } + in_data += in_height * in_width * channels; + out_data += out_height * out_width * channels; + } +} + +template +void Pad2DReflectNCHW(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T* out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = + std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = + std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w]; + } + } + in_data += in_height * in_width; + out_data += out_height * out_width; + } + } +} + +template +void Pad2DReflectNHWC(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T* out_data) { + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + 
out_w) * channels; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + const int in_index = (in_h * in_width + in_w) * channels; + + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } + } + in_data += in_height * in_width * channels; + out_data += out_height * out_width * channels; + } +} + +template +void Pad2DEdgeNCHW(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, const int pad_top, + const int pad_left, T* out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w]; + } + } + in_data += in_height * in_width; + out_data += out_height * out_width; + } + } +} + +template +void Pad2DEdgeNHWC(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, const int pad_top, + const int pad_left, T* out_data) { + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + out_w) * channels; + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } + } + in_data += in_height * in_width * channels; + out_data += out_height * out_width * channels; + } +} + +template +void Pad2DGradConstNCHW(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + if (!(in_h < 0 || in_w < 0 || in_h >= in_height || + in_w >= in_width)) { + d_in_data[in_h * in_width + in_w] = + d_out_data[out_h * out_width + out_w]; + } + } + } + d_in_data += in_height * in_width; + d_out_data += out_height * out_width; + } + } +} + +template +void Pad2DGradConstNHWC(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = (out_h * out_width + out_w) * channels; + if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) { + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] = d_out_data[out_index + c]; + } + } + } + } + d_in_data += 
in_height * in_width * channels; + d_out_data += out_height * out_width * channels; + } +} + +template +void Pad2DGradReflectNCHW(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); // reflect over 0 + in_h = std::min(in_h, + 2 * in_height - in_h - 2); // reflect over in_height + in_w = std::max(in_w, -in_w); // reflect over 0 + in_w = + std::min(in_w, 2 * in_width - in_w - 2); // reflect over in_width + d_in_data[in_h * in_width + in_w] += + d_out_data[out_h * out_width + out_w]; + } + } + d_in_data += in_height * in_width; + d_out_data += out_height * out_width; + } + } +} + +template +void Pad2DGradReflectNHWC(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + out_w) * channels; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } + } + } + d_in_data += in_height * in_width * channels; + d_out_data += out_height * out_width * channels; + } +} + +template +void Pad2DGradEdgeNCHW(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + d_in_data[in_h * in_width + in_w] += + d_out_data[out_h * out_width + out_w]; + } + } + d_in_data += in_height * in_width; + d_out_data += out_height * out_width; + } + } +} + +template +void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + out_w) * channels; + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } + } + } + d_in_data += in_height * in_width * channels; + d_out_data += out_height * out_width * channels; + } +} + +template +class 
Pad2dCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + T value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto in_dims = x->dims(); + auto out_dims = out->dims(); + const T* in_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = in_dims[0]; + if (data_format == "NCHW") { + const int channels = in_dims[1]; + const int in_height = in_dims[2]; + const int in_width = in_dims[3]; + const int out_height = out_dims[2]; + const int out_width = out_dims[3]; + if (mode == "reflect") { + Pad2DReflectNCHW(in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNCHW(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNCHW(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } else { + const int channels = in_dims[3]; + const int in_height = in_dims[1]; + const int in_width = in_dims[2]; + const int out_height = out_dims[1]; + const int out_width = out_dims[2]; + if (mode == "reflect") { + Pad2DReflectNHWC(in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNHWC(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNHWC(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } + } +}; + +template +class Pad2dGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = d_in->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + set_zero(context.template device_context(), + d_in, static_cast(0)); + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = d_in_dims[0]; + if (data_format == "NCHW") { + const int channels = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad2DGradReflectNCHW(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNCHW(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, d_out_data); + } else { + Pad2DGradConstNCHW(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } + } else { + const int channels = d_in_dims[3]; + const int in_height = d_in_dims[1]; + const int in_width = d_in_dims[2]; + const int out_height = d_out_dims[1]; + const int out_width = d_out_dims[2]; + if (mode == "reflect") { 
+ Pad2DGradReflectNHWC(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNHWC(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, d_out_data); + } else { + Pad2DGradConstNHWC(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } + } + } +}; + +class Pad2dOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Pad2dOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Pad2dOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ(x_dim.size(), 4, + "Size of paddings should be equal to 4."); + std::vector out_dims(x_dim.size()); + + auto data_format = ctx->Attrs().Get("data_format"); + out_dims[0] = x_dim[0]; + if (data_format == "NCHW") { + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height + out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width + } else { // NHWC + out_dims[3] = x_dim[3]; + out_dims[1] = x_dim[1] + paddings[0] + paddings[1]; + out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad2d op. " + "The input should be a 4-D tensor with formate NCHW or NHWC."); + AddOutput("Out", + "The output of pad2d op. " + "A tensor with the same shape as X."); + AddAttr>( + "paddings", + "(vector) " + "A list to describe the padding rules." + "paddings=[0, 1, 2, 3] means " + "padding 0 row to top, 1 row to bottom, 2 columns to left " + "and 3 columns to right. Size of paddings must be 4."); + AddAttr("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas in constant mode.") + .SetDefault(0.0f); + AddAttr("mode", + "(float, default constant) " + "Three modes: constant(default), reflect, edge.") + .SetDefault("constant"); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the input data.") + .SetDefault("NCHW"); + AddComment(R"DOC( +Pad2d Operator. +Pad 2-d images accordding to 'paddings' and 'mode'. +If mode is 'reflect', paddings[0] and paddings[1] must be no greater +than height-1. And the width dimension has the same condition. 
+ +Given that X is a channel of image from input: + +X = [[1, 2, 3], + [4, 5, 6]] + +Case 0: + +paddings = [0, 1, 2, 3], +mode = 'constant' +pad_value = 0 + +Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] + +Case 1: + +paddings = [0, 1, 2, 1], +mode = 'reflect' + +Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] + +Case 2: + +paddings = [0, 1, 2, 1], +mode = 'edge' + +Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] +)DOC"); + } +}; + +class Pad2dOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); + bind->SetInput("X", Input("X")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); + bind->SetAttrMap(Attrs()); + bind->SetType("pad2d_grad"); + return std::unique_ptr(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, + ops::Pad2dOpGradMaker); +REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad); +REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel); +REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel); diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ba0ddbd84a43cfd5f028ce072b5c7606fae343d --- /dev/null +++ b/paddle/fluid/operators/pad2d_op.cu @@ -0,0 +1,432 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
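// A standalone sketch (illustrative, not part of this patch) of the
// single-channel index arithmetic used by the Pad2DReflectNCHW CPU kernel
// above. With X = {{1,2,3},{4,5,6}} and paddings = [0, 1, 2, 1] it reproduces
// Case 1 from the Pad2d operator documentation above.
#include <algorithm>
#include <cstdio>

int main() {
  const int in_h_dim = 2, in_w_dim = 3;
  const int pad_top = 0, pad_bottom = 1, pad_left = 2, pad_right = 1;
  const int out_h_dim = in_h_dim + pad_top + pad_bottom;  // 3
  const int out_w_dim = in_w_dim + pad_left + pad_right;  // 6
  const int x[2][3] = {{1, 2, 3}, {4, 5, 6}};
  for (int out_h = 0; out_h < out_h_dim; ++out_h) {
    for (int out_w = 0; out_w < out_w_dim; ++out_w) {
      int in_h = out_h - pad_top;
      int in_w = out_w - pad_left;
      in_h = std::max(in_h, -in_h);                    // reflect across row 0
      in_h = std::min(in_h, 2 * in_h_dim - in_h - 2);  // reflect across the last row
      in_w = std::max(in_w, -in_w);                    // reflect across column 0
      in_w = std::min(in_w, 2 * in_w_dim - in_w - 2);  // reflect across the last column
      std::printf("%d ", x[in_h][in_w]);
    }
    std::printf("\n");  // prints: 3 2 1 2 3 2 / 6 5 4 5 6 5 / 3 2 1 2 3 2
  }
  return 0;
}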
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +using framework::Tensor; + +template +__global__ void Pad2DConstNCHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[index] = + (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) + ? value + : in_data[(nc * in_height + in_h) * in_width + in_w]; + } +} + +template +__global__ void Pad2DConstNHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int in_h = out_h - pad_top; + const int in_w = out_w - pad_left; + out_data[index] = + (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) + ? value + : in_data[((n * in_height + in_h) * in_width + in_w) * channels + + c]; + } +} + +template +__global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); // reflect by 0 + in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = max(in_w, -in_w); // reflect by 0 + in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w]; + } +} + +template +__global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = max(in_w, -in_w); + in_w = min(in_w, 2 * in_width - in_w - 2); + out_data[index] = + in_data[((n * in_height + in_h) * in_width + in_w) * channels + c]; + } +} + +template +__global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data, + const int num, const int 
channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w]; + } +} + +template +__global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + out_data[index] = + in_data[((n * in_height + in_h) * in_width + in_w) * channels + c]; + } +} + +template +__global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_1D_KERNEL_LOOP(in_index, in_size) { + int nc = in_index / in_width; + const int out_w = in_index % in_width + pad_left; + const int out_h = nc % in_height + pad_top; + nc /= in_height; + d_in_data[in_index] = + d_out_data[(nc * out_height + out_h) * out_width + out_w]; + } +} + +template +__global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_1D_KERNEL_LOOP(in_index, in_size) { + int n = in_index / channels; + const int c = in_index % channels; + const int out_w = n % in_width + pad_left; + n /= in_width; + const int out_h = n % in_height + pad_top; + n /= in_height; + d_in_data[in_index] = + d_out_data[((n * out_height + out_h) * out_width + out_w) * channels + + c]; + } +} + +template +__global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = min(in_w, 2 * in_width - in_w - 2); + atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % 
out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + in_h = min(in_h, in_height * 2 - in_h - 2); + in_w = min(in_w, in_width * 2 - in_w - 2); + atomicAdd( + &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + atomicAdd( + &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], + d_out_data[out_index]); + } +} + +template +class Pad2dCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + T value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto in_dims = x->dims(); + auto out_dims = out->dims(); + const T* in_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = in_dims[0]; + + auto stream = context.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = out->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCHW") { + const int channels = in_dims[1]; + const int in_height = in_dims[2]; + const int in_width = in_dims[3]; + const int out_height = out_dims[2]; + const int out_width = out_dims[3]; + if (mode == "reflect") { + Pad2DReflectNCHW<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNCHW<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNCHW<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } else { + const int channels = in_dims[3]; + const int in_height = in_dims[1]; + const int in_width = in_dims[2]; + const int out_height = out_dims[1]; + const int out_width = out_dims[2]; + if (mode == "reflect") { + 
Pad2DReflectNHWC<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNHWC<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNHWC<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } + } +}; + +template +class Pad2dGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = d_in->mutable_data(context.GetPlace()); + + math::SetConstant set_zero; + set_zero(context.template device_context(), + d_in, static_cast(0)); + + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = d_in_dims[0]; + + auto stream = context.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = d_out->numel(); + const int in_size = d_in->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCHW") { + const int channels = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad2DGradReflectNCHW<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNCHW<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad2DGradConstNCHW<<>>( + in_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } + } else { + const int channels = d_in_dims[3]; + const int in_height = d_in_dims[1]; + const int in_width = d_in_dims[2]; + const int out_height = d_out_dims[1]; + const int out_width = d_out_dims[2]; + if (mode == "reflect") { + Pad2DGradReflectNHWC<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNHWC<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad2DGradConstNHWC<<>>( + in_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(pad2d, ops::Pad2dCUDAKernel); +REGISTER_OP_CUDA_KERNEL(pad2d_grad, ops::Pad2dGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..37646c7b4c50fc7409002aca56e5462bde93cc30 --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
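// A host-side illustration (not part of this patch) of the grid-stride
// pattern behind CUDA_1D_KERNEL_LOOP used by the pad2d CUDA kernels above:
// grid is rounded up as (out_size + block - 1) / block and each simulated
// thread strides by block * grid, so every flat output index in
// [0, out_size) is covered exactly once. The value 512 is only illustrative;
// the real kernels use PADDLE_CUDA_NUM_THREADS.
#include <cassert>
#include <vector>

int main() {
  const int out_size = 10000;                       // e.g. out->numel()
  const int block = 512;                            // illustrative thread-block size
  const int grid = (out_size + block - 1) / block;  // same rounding-up as above
  std::vector<int> visits(out_size, 0);
  for (int b = 0; b < grid; ++b) {                  // blockIdx.x
    for (int t = 0; t < block; ++t) {               // threadIdx.x
      for (int i = b * block + t; i < out_size; i += block * grid) {
        ++visits[i];
      }
    }
  }
  for (int v : visits) assert(v == 1);              // each element handled exactly once
  return 0;
}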
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pad_constant_like_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class PadConstantLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PadConstantLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of PadConstantLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PadConstantLikeOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(), + "The dimention of X and Y should be the same."); + + for (int i = 0; i < x_dim.size(); ++i) { + PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]); + } + ctx->SetOutputDim("Out", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Y")->type()), + ctx.device_context()); + } +}; + +class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad_constant_like op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddInput("Y", + "The input of pad_constant_like op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddOutput("Out", + "The output of pad_constant_like op. " + "A tensor with the same shape as X."); + AddAttr("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas.") + .SetDefault(0.0f); + AddComment(R"DOC( +PadConstantLikeOp Operator. + +Pad input(Y) with a pad_value, the number of values padded to the edges of each +axis is specified by the difference of the shape of X and Y. +((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for +each axis. +The input should be a k-D tensor(k > 0 and k < 7). 
As an example: + +case1: + Given: + X = [[1, 2], + [3, 4], + [1, 2], + [3, 4]]], + X.shape = (4, 2) + + Y = [[5, 6], + [7, 8]], + Y.shape = (2, 2) + + And + pad_value = 0, + + Return: + Out = [[5, 6], + [7, 8], + [0, 0], + [0, 0]] + Out.shape = (4, 2) + +case2: + Given: + X = [[[[ 0, 1, 2], + [ 3, 4, 5]], + [[ 6, 7, 8], + [ 9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]]], + [[[18, 19, 20], + [21, 22, 23]], + [[24, 25, 26], + [27, 28, 29]], + [[30, 31, 32], + [33, 34, 35]]]] + X.shape = (2, 3, 2, 3) + + Y = [[[[35, 36, 37]], + [[38, 39, 40]], + [[41, 42, 43]]]] + Y.shape = (1, 3, 1, 3) + + And + pad_value = -1, + + Return: + + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) +)DOC"); + } +}; + +class PadConstantLikeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto y_dim = ctx->GetInputDim("Y"); + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(), + "The dimention of X and Y should be the same."); + + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dim); + ctx->ShareLoD("Y", /*->*/ y_grad_name); + + for (int i = 0; i < y_dim.size(); ++i) { + PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Y")->type()), + ctx.device_context()); + } +}; + +class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *bind = new framework::OpDesc(); + bind->SetType("pad_constant_like_grad"); + bind->SetInput("Y", Input("Y")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + bind->SetAttrMap(Attrs()); + return std::unique_ptr(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp, + ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker); +REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); + +REGISTER_OP_CPU_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CPU_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ea69577904577de353b63491973bf74b7724e18e --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
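// A small sketch (illustrative, not part of this patch) of how
// pad_constant_like derives its padding amounts: for every axis j the lower
// pad is 0 and the upper pad is x_dims[j] - y_dims[j], matching
// ((0, shape_x_0 - shape_y_0), ..., (0, shape_x_n - shape_y_n)) in the
// operator documentation above. Shapes are taken from case2 above.
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int> x_dims = {2, 3, 2, 3};  // X.shape
  const std::vector<int> y_dims = {1, 3, 1, 3};  // Y.shape
  const int rank = static_cast<int>(x_dims.size());
  std::vector<int> pads(rank * 2, 0);
  for (int j = 0; j < rank; ++j) {
    pads[j * 2] = 0;                          // nothing is padded before the data
    pads[j * 2 + 1] = x_dims[j] - y_dims[j];  // pad Y up to the shape of X
  }
  for (int p : pads) std::printf("%d ", p);   // prints: 0 1 0 0 0 1 0 0
  return 0;
}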
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/pad_constant_like_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..01d66901afc49a487c344b039b65f547967e95ff --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/padding.h" + +namespace paddle { +namespace operators { + +template +class PadConstantLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto in_x = context.Input("X"); + auto in_y = context.Input("Y"); + auto* out = context.Output("Out"); + + if (in_x->dims() == in_y->dims()) { + // TensorCopy(in_y, context.GetPlace(), context, out); + out->ShareDataWith(*in_y); + return; + } + + T pad_value = context.Attr("pad_value"); + out->mutable_data(context.GetPlace()); + + int rank = context.Input("X")->dims().size(); + + std::vector pads(rank * 2, 0); + + for (int j = 0; j < rank; ++j) { + pads[j * 2] = 0; + pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); + } + + math::PaddingFunctor(rank, context, pads, pad_value, + *in_y, out); + } +}; + +template +class PadConstantLikeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto in_y = context.Input("Y"); + auto in_dout = + context.Input(framework::GradVarName("Out")); + auto* d_y = context.Output(framework::GradVarName("Y")); + + if (d_y == nullptr) { + return; + } + + if (in_dout->dims() == in_y->dims()) { + // TensorCopy(in_dout, context.GetPlace(), context, d_y); + d_y->ShareDataWith(*in_dout); + return; + } + + d_y->mutable_data(context.GetPlace()); + int rank = in_dout->dims().size(); + + std::vector pads(static_cast(rank) * 2, 0); + for (int j = 0; j < rank; ++j) { + pads[j * 2] = 0; + pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); + } + + 
math::PaddingGradFunctor(rank, context, pads, *in_dout, + d_y); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index c93c096575a30dd9344894ead4b81acc16930e21..32698dac4917e183cfe36c831787b049985b19b3 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -18,117 +18,44 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/padding.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -template -void PadFunction(const framework::ExecutionContext& context) { - auto pads = context.Attr>("paddings"); - Eigen::array, D> paddings; - for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i].first = pads[i * 2]; - paddings[i].second = pads[i * 2 + 1]; - } - T pad_value = context.Attr("pad_value"); - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto x_tensor = EigenTensor::From(*x); - auto out_tensor = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.pad(paddings, pad_value); -} - template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - int rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - PadFunction(context); - break; - case 2: - PadFunction(context); - break; - case 3: - PadFunction(context); - break; - case 4: - PadFunction(context); - break; - case 5: - PadFunction(context); - break; - case 6: - PadFunction(context); - break; - default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); - } + auto pads = context.Attr>("paddings"); + T pad_value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + int rank = x->dims().size(); + math::PaddingFunctor(rank, context, pads, pad_value, *x, + out); } }; -template -void PadGradFunction(const framework::ExecutionContext& context) { - auto pads = context.Attr>("paddings"); - Eigen::array, D> paddings; - for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i].first = -pads[i * 2]; - paddings[i].second = -pads[i * 2 + 1]; - } - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x != nullptr) { - d_x->mutable_data(context.GetPlace()); - auto d_x_tensor = EigenTensor::From(*d_x); - auto d_out_tensor = EigenTensor::From(*d_out); - auto& place = - *context.template device_context().eigen_device(); - d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); - } -} - template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - size_t rank = - context.Input(framework::GradVarName("Out"))->dims().size(); - switch (rank) { - case 1: - PadGradFunction(context); - break; - case 2: - PadGradFunction(context); - break; - case 3: - PadGradFunction(context); - break; - case 4: - PadGradFunction(context); - break; - case 5: - PadGradFunction(context); - break; - case 6: - PadGradFunction(context); - break; - default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + auto pads = 
context.Attr>("paddings"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x == nullptr) { + return; } + + d_x->mutable_data(context.GetPlace()); + int rank = d_out->dims().size(); + math::PaddingGradFunctor(rank, context, pads, *d_out, + d_x); } }; diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index c9744db3d0654ef63357963d9a9a3cb946f56e2d..97c36a83fc5eff421725d05f66fca05f5169d1bb 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -164,14 +163,11 @@ class ParallelDoOp : public framework::OperatorBase { auto &place = places[place_idx]; auto *cur_scope = sub_scopes[place_idx]; - workers.emplace_back( - framework::Async([program, cur_scope, place, block, place_idx] { - // Give the thread an id to distinguish parallel block with same id. - platform::RecordThread rt(static_cast(place_idx) + 1); - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); @@ -242,14 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase { auto *cur_scope = sub_scopes[i]; // execute - workers.emplace_back( - framework::Async([program, cur_scope, place, block, i] { - // Give the thread an id to distinguish parallel block with same id. 
- platform::RecordThread rt(static_cast(i) + 1); - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); @@ -362,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { grad->SetInput(framework::GradVarName(output_param), og_names); } } + grad->SetInput("Communicator", {"nccl_com__do_not_change_"}); grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kParallelBlock, grad_block_[0]); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index be55bc43b14f1e6211f71b4080d1676838ad508c..31f083565fddee66aea1485ed71f41b6199f4502 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, cudnn_output_desc, output_data)); } @@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 8734282fe496b8e90af19abd5549566d62316fc3..0519c15e13aac99802ff0f95b975712b36b44246 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " << outs[i] << " back"; - rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); + rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, + ins[i], outs[i])); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - rpc_client->Wait(); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } }; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index db040509bc08c3f6ad031c5b97c93574e31337e0..e0c4c81bdd5b5d0af3bafe632a2fa033efd08050 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
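// Not part of this patch: the prefetch_op change above switches from a
// blanket rpc_client->Wait() to collecting one handle per request and
// checking each handle individually. A rough standard-library analogy of
// that pattern; FetchOne and std::async stand in for the RPC client and are
// assumptions, not Paddle APIs.
#include <future>
#include <stdexcept>
#include <vector>

bool FetchOne(int id) { return id >= 0; }  // stand-in for one async prefetch

int main() {
  std::vector<std::future<bool>> rets;
  for (int i = 0; i < 4; ++i) {
    rets.push_back(std::async(std::launch::async, FetchOne, i));  // issue all requests first
  }
  for (auto& r : rets) {
    if (!r.get()) {  // then wait on and validate each handle
      throw std::runtime_error("internal error in RPC client");
    }
  }
  return 0;
}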
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,14 +23,43 @@ class PReluOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); - PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, - "Size of weight Alpha must be one."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + std::string mode = ctx->Attrs().Get("mode"); + + auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PreluOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Alpha"), + "Input(Alpha) of PreluOp should not be null"); + + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PreluOp should not be null"); + if (mode == "all") { + PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, + "For mode 'all', size of weight Alpha must be one."); + } else if (mode == "channel") { + PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == x_dim[1], + "For channel-wise mode, size of weight Alpha must be " + "equal to the number of channels, should be %d", + x_dim[1]); + } else if (mode == "element") { + PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == product(x_dim), + "For element-wise mode, size of weight Alpha must be " + "equal to the number of input, should be %d", + product(x_dim)); + } else { + PADDLE_THROW("Unkown mode %s", mode); + } + ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,9 +70,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of prelu operator."); AddComment(R"DOC( PRelu Operator. - The equation is: - $$ f(x) = \begin{cases} @@ -54,11 +78,15 @@ f(x) = x, \qquad \text{if} \ x >= 0 \end{cases} $$ - The input `X` can carry the LoD (Level of Details) information, or not. And the output shares the LoD information with input `X`. 
- +There are modes: + all: all elements share same weight + channel: elements in a channel share same weight + element: each element has a weight )DOC"); + AddAttr("mode", "The mode for inputs to share weights.") + .SetDefault("all"); } }; @@ -71,9 +99,23 @@ class PReluGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->SetOutputDim(framework::GradVarName("Alpha"), - ctx->GetInputDim("Alpha")); + auto x_grad_name = framework::GradVarName("X"); + auto alpha_grad_name = framework::GradVarName("Alpha"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); + } + if (ctx->HasOutput(alpha_grad_name)) { + ctx->SetOutputDim(alpha_grad_name, ctx->GetInputDim("Alpha")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h index a6197d354833a2f4173003ad2a970c487ad9a65b..12f1525594ecf0887618616ffe563bd2bda32496 100644 --- a/paddle/fluid/operators/prelu_op.h +++ b/paddle/fluid/operators/prelu_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,32 +10,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" - namespace paddle { namespace operators { using Tensor = framework::Tensor; using platform::Transform; -template -class PReluFunctor { - public: - explicit PReluFunctor(const T* alpha) : alpha_(alpha) {} - - HOSTDEVICE T operator()(const T& x) const { - if (x > 0) - return x; - else - return x * (*alpha_); - } - - private: - const T* alpha_; -}; - template class PReluKernel : public framework::OpKernel { public: @@ -50,53 +31,92 @@ class PReluKernel : public framework::OpKernel { const T* x_ptr = x->data(); T* o_ptr = out->mutable_data(context.GetPlace()); - auto* alpha_ptr = alpha->data(); + const T* alpha_ptr = alpha->data(); + std::string mode = context.Attr("mode"); int numel = x->numel(); - - Transform trans; - trans(context.template device_context(), x_ptr, - x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); - } -}; - -template -class PReluGradFunctor { - public: - explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {} - - HOSTDEVICE T operator()(const T& out, const T& dout) const { - if (out > 0) - return dout; - else - return dout * (*alpha_); + auto dim = x->dims(); + int index = 0; + int i = 0; + if (mode == "channel") { + int temp = numel / (dim[0] * dim[1]); + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + o_ptr[i] = x_ptr[i] > 0 ? 
x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else if (mode == "element") { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } } - - private: - const T* alpha_; }; template class PReluGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); auto* dx = context.Output(framework::GradVarName("X")); auto* dout = context.Input(framework::GradVarName("Out")); - + auto* dalpha = context.Output(framework::GradVarName("Alpha")); auto* out = context.Input("Out"); auto* alpha = context.Input("Alpha"); - auto* alpha_ptr = alpha->data(); - - T* dx_ptr = dx->mutable_data(context.GetPlace()); + const T* alpha_ptr = alpha->data(); + const T* x_ptr = x->data(); const T* dout_ptr = dout->data(); const T* out_ptr = out->data(); - int numel = dx->numel(); - - Transform trans; - trans(context.template device_context(), out_ptr, - out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); - - // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready + std::string mode = context.Attr("mode"); + int numel = x->numel(); + auto dim = x->dims(); + int index = 0; + int i = 0; + int temp = 0; + if (dx) { + T* dx_ptr = dx->mutable_data(context.GetPlace()); + if (mode == "channel") { + for (i = 0; i < numel; i++) { + temp = numel / (dim[0] * dim[1]); + index = (i / temp) % dim[1]; + dx_ptr[i] = + out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; + } + } else if (mode == "element") { + for (i = 0; i < numel; i++) { + dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[i] * dout_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; + } + } + } + + index = 0; + if (dalpha) { + T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); + if (mode == "channel") { + for (i = 0; i < numel; i++) { + temp = numel / (dim[0] * dim[1]); + index = (i / temp) % dim[1]; + dalpha_ptr[index] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; + } + } else if (mode == "element") { + for (i = 0; i < numel; i++) { + dalpha_ptr[i] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + dalpha_ptr[0] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; + } + } + } + + // TODO(Guanzhong): add GPU kernels } }; diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index cceac402951ae6bf3fe0b4c96af5b7ce9ca1ba0e..e7f1caf4d3a81dc7633139933c6a4c3d51a4e2a0 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -13,14 +13,12 @@ limitations under the License. 
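// A standalone sketch (illustrative, not part of this patch) of the
// channel-wise PRelu indexing used by the PReluKernel above. For an NCHW
// tensor the per-channel slope is looked up as alpha[(i / (H * W)) % C],
// i.e. temp = numel / (N * C) and index = (i / temp) % C in the kernel code.
#include <cstdio>
#include <vector>

int main() {
  const int N = 1, C = 2, H = 2, W = 2;
  const std::vector<float> x = {1.f, -1.f, 2.f, -2.f,   // channel 0
                                3.f, -3.f, 4.f, -4.f};  // channel 1
  const std::vector<float> alpha = {0.1f, 0.5f};        // one slope per channel
  const int numel = N * C * H * W;
  const int temp = numel / (N * C);                     // = H * W
  std::vector<float> out(numel);
  for (int i = 0; i < numel; ++i) {
    const int c = (i / temp) % C;                       // channel of flat index i
    out[i] = x[i] > 0 ? x[i] : alpha[c] * x[i];         // f(x) = x if x >= 0 else alpha_c * x
  }
  for (float v : out) std::printf("%g ", v);            // 1 -0.1 2 -0.2 3 -1.5 4 -2
  return 0;
}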
*/ #include -#include - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/framework/variable.h" namespace paddle { namespace operators { +using framework::GradVarName; #define CLOG std::cout @@ -35,7 +33,7 @@ struct Formater { std::type_index dtype{typeid(const char)}; framework::LoD lod; int summarize; - void* data{nullptr}; + void *data{nullptr}; void operator()(size_t size) { PrintMessage(); @@ -101,7 +99,7 @@ struct Formater { template void Display(size_t size) { - auto* d = reinterpret_cast(data); + auto *d = reinterpret_cast(data); CLOG << "\tdata: "; if (summarize != -1) { summarize = std::min(size, (size_t)summarize); @@ -120,51 +118,36 @@ struct Formater { // TODO(ChunweiYan) there should be some other printers for TensorArray class TensorPrintOp : public framework::OperatorBase { public: - TensorPrintOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + TensorPrintOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - TensorPrintOp(const TensorPrintOp& o) + TensorPrintOp(const TensorPrintOp &o) : framework::OperatorBase( - static_cast(o)) { + static_cast(o)) { PADDLE_THROW("Not implemented."); } private: - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - const framework::Variable* in_var_ptr = nullptr; - std::string phase(kForward); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + const framework::Variable *in_var_ptr = nullptr; std::string printed_var_name = ""; - auto& inputs = Inputs(); - if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { - in_var_ptr = scope.FindVar(Input("In")); - printed_var_name = Inputs("In").front(); - } else if (inputs.find("In@GRAD") != inputs.end() && - !Inputs("In@GRAD").empty()) { - in_var_ptr = scope.FindVar(Input("In@GRAD")); - printed_var_name = Inputs("In@GRAD").front(); - phase = std::string(kBackward); - } else { - PADDLE_THROW("Unknown phase, should be forward or backward."); - } + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); PADDLE_ENFORCE_NOT_NULL(in_var_ptr); - auto& in_tensor = in_var_ptr->Get(); - auto* out_var_ptr = scope.FindVar(Output("Out")); - auto& out_tensor = *out_var_ptr->GetMutable(); - - // Just copy data from input tensor to output tensor - // output tensor share same memory with input tensor - out_tensor.ShareDataWith(in_tensor); - out_tensor.set_lod(in_tensor.lod()); + auto &in_tensor = in_var_ptr->Get(); std::string print_phase = Attr("print_phase"); - if (print_phase != phase && print_phase != std::string(kBoth)) { + bool is_forward = Attr("is_forward"); + + if ((is_forward && print_phase == kBackward) || + (!is_forward && print_phase == kForward)) { return; } @@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.dtype = printed_tensor.type(); } if (Attr("print_tensor_shape")) { - auto& dims = printed_tensor.dims(); + auto &dims = printed_tensor.dims(); formater.dims.resize(dims.size()); for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; } @@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.lod = printed_tensor.lod(); } formater.summarize = Attr("summarize"); - formater.data = 
reinterpret_cast(printed_tensor.data()); + formater.data = reinterpret_cast(printed_tensor.data()); formater(printed_tensor.numel()); } @@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); - AddAttr( - "print_phase", - "(string, default 'BOTH') Which phase to display including 'FORWARD' " - "'BACKWARD' and 'BOTH'.") + AddAttr("print_phase", + "(string, default 'FORWARD') Which phase to display " + "including 'FORWARD' " + "'BACKWARD' and 'BOTH'.") .SetDefault(std::string(kBoth)) .InEnum({std::string(kForward), std::string(kBackward), std::string(kBoth)}); - AddOutput("Out", "Output tensor with same data as input tensor."); + AddAttr("is_forward", "Whether is forward or not").SetDefault(true); AddComment(R"DOC( Creates a print op that will print when a tensor is accessed. @@ -238,40 +221,21 @@ tensor `t`.)DOC"); class InferShapeForward : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* context) const override { + void operator()(framework::InferShapeContext *context) const override { PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); - context->ShareLoD("In", /*->*/ "Out"); - context->SetOutputDim("Out", context->GetInputDim("In")); - } -}; - -class InferShapeBackward : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE(context->HasInput("In@GRAD"), - "Input(In@GRAD) should not be null."); - context->ShareLoD("In@GRAD", /*->*/ "Out"); - context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); } }; -class InferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override {} -}; - -class PrintOpProtoAndCheckGradOpMaker - : public framework::SingleGradOpDescMaker { +class PrintOpGradientMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; std::unique_ptr Apply() const override { - auto* op_desc_ptr = new framework::OpDesc(); - op_desc_ptr->SetType("print_grad"); - op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); - op_desc_ptr->SetOutput("Out", InputGrad("In")); + auto *op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print"); + op_desc_ptr->SetInput("In", InputGrad("In")); op_desc_ptr->SetAttrMap(Attrs()); + op_desc_ptr->SetAttr("is_forward", false); return std::unique_ptr(op_desc_ptr); } }; @@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker namespace ops = paddle::operators; REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, - ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, - ops::InferVarType); -REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward); + ops::PrintOpGradientMaker, ops::InferShapeForward); diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 65fcce8bb019965a805ad09d50be0aba64e4f24e..a0d640b2020958af53a4405ae886eadb2a1e117e 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include 
"paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase { .GetMutable(); std::vector out_arg_names = Outputs("Out"); std::vector ins; + + // For profiling + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(dev_place); + platform::RecordEvent record_event(Type(), &ctx); + reader->ReadNext(&ins); if (ins.empty()) { if (Attr("throw_eof_exp")) { diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index a39c8a00538875e4e3284898230a6cb0693b7a12..728197377df04df8c993a48bc282431473fe9959 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -15,14 +15,14 @@ function(reader_library TARGET_NAME) PARENT_SCOPE) endfunction() -reader_library(open_files_op SRCS open_files_op.cc) +cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) +reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) -reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) +reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) -reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) reader_library(create_py_reader_op SRCS create_py_reader_op.cc) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index db8cf3b605c9175eeda4548b1e7c8203f26c5d89..28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -81,6 +81,15 @@ class BlockingQueue { } } + void ReOpen() { + std::lock_guard lock(mutex_); + closed_ = false; + std::deque new_deque; + queue_.swap(new_deque); + send_cv_.notify_all(); + receive_cv_.notify_all(); + } + void Close() { std::lock_guard lock(mutex_); closed_ = true; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..26ff221dfa0768bd2bcc9e6485a32485f0212ac6 --- /dev/null +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reader/buffered_reader.h" +#include + +namespace paddle { +namespace operators { +namespace reader { +BufferedReader::~BufferedReader() { + reader_->Shutdown(); + while (!position_.empty()) { + position_.front().wait(); + position_.pop(); + } +} + +BufferedReader::BufferedReader( + const std::shared_ptr &reader, + const platform::Place &place, size_t buffer_size) + : framework::DecoratedReader(reader), + thread_pool_(1), + place_(place), + buffer_size_(buffer_size) { + cpu_buffer_.resize(buffer_size); + gpu_buffer_.resize(buffer_size); + ReadTillBufferFullAsync(); +} + +void BufferedReader::ReadTillBufferFullAsync() { + PADDLE_ENFORCE_EQ(position_.size(), 0U); + for (size_t i = 0; i < buffer_size_; ++i) { + ReadAsync(i); + } +} + +void BufferedReader::ReadAsync(size_t i) { + position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { + TensorVec &cpu = cpu_buffer_[i]; + reader_->ReadNext(&cpu); + + if (cpu.empty()) { + return -1UL; + } + + if (platform::is_gpu_place(place_)) { + TensorVec &gpu = gpu_buffer_[i]; + gpu.resize(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + framework::TensorCopySync(cpu[i], place_, &gpu[i]); + gpu[i].set_lod(cpu[i].lod()); + } + } + return i; + })); +} + +void BufferedReader::ShutdownImpl() { + reader_->Shutdown(); + while (!position_.empty()) { + position_.pop(); + } + prev_pos_ = -1UL; +} + +void BufferedReader::StartImpl() { + reader_->Start(); + ReadTillBufferFullAsync(); +} + +void BufferedReader::ReadNextImpl(std::vector *out) { + if (position_.empty()) { + out->clear(); + return; + } + size_t i = position_.front().get(); + position_.pop(); + + if (i == -1UL) { + ReadNextImpl(out); + return; + } + + *out = platform::is_gpu_place(place_) ? gpu_buffer_[i] : cpu_buffer_[i]; + + // Do not push current position into ReadAsync. Push the previous position + // Since all computation in fluid are async, change the data of + // current position may cause data error. + if (prev_pos_ != -1Ul) { + ReadAsync(prev_pos_); + } + prev_pos_ = i; +} + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..cbe2bc1b5fdd69d1a843b768e3289acd621369a6 --- /dev/null +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "ThreadPool.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { +namespace reader { + +class BufferedReader : public framework::DecoratedReader { + using TensorVec = std::vector; + using VecFuture = std::future; + + public: + BufferedReader(const std::shared_ptr& reader, + const platform::Place& place, size_t buffer_size); + + ~BufferedReader() override; + + private: + void ReadTillBufferFullAsync(); + + void ReadAsync(size_t i); + + protected: + void ShutdownImpl() override; + void StartImpl() override; + void ReadNextImpl(std::vector* out) override; + + private: + ThreadPool thread_pool_; + platform::Place place_; + const size_t buffer_size_; + + std::queue> position_; + + // The buffer for reading data. + // NOTE: the simplest way to implement buffered reader is do not use any + // buffer, just read async and create futures as buffer size. However, to + // malloc tensors every time is extremely slow. Here we store all data in + // buffers and prevent alloc every time. + std::vector cpu_buffer_; + std::vector gpu_buffer_; + size_t prev_pos_{-1UL}; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index ecbae3894d551186f53625a6cc9cfdb36adc8d2d..e17c2ffd39eea31fe85933eda144ab97cf8c3dd8 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -20,15 +20,19 @@ namespace reader { class BatchReader : public framework::DecoratedReader { public: - BatchReader(const std::shared_ptr& reader, int batch_size) - : DecoratedReader(reader), batch_size_(batch_size) { + BatchReader(const std::shared_ptr& reader, int batch_size, + bool discard_leftover) + : DecoratedReader(reader), + batch_size_(static_cast(batch_size)), + discard_leftover_(discard_leftover) { buffer_.reserve(batch_size_); } - void ReadNext(std::vector* out) override; + void ReadNextImpl(std::vector* out) override; private: - int batch_size_; + size_t batch_size_; + bool discard_leftover_; std::vector> buffer_; }; @@ -46,8 +50,9 @@ class CreateBatchReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - out->Reset( - new BatchReader(underlying_reader.Get(), Attr("batch_size"))); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, Attr("batch_size"), + Attr("discard_leftover"))); } }; @@ -57,6 +62,10 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { AddAttr("batch_size", "How many instances the batch reader yields each time.") .GreaterThan(0); + AddAttr("discard_leftover", + "If true, the leftover instances that are not enough for a " + "new batch will be discarded.") + .SetDefault(true); AddComment(R"DOC( CreateBatchReader Operator @@ -66,10 +75,10 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { } }; -void BatchReader::ReadNext(std::vector* out) { +void BatchReader::ReadNextImpl(std::vector* out) { buffer_.clear(); buffer_.reserve(batch_size_); - for (int i = 0; i < batch_size_; ++i) { + for (size_t i = 0; i < batch_size_; ++i) { buffer_.push_back(std::vector()); reader_->ReadNext(&buffer_.back()); if (buffer_.back().empty()) { @@ -77,15 +86,18 @@ void BatchReader::ReadNext(std::vector* out) { break; } } + if (discard_leftover_ && buffer_.size() < batch_size_) { + buffer_.clear(); + } // 
Concat instances out->clear(); if (buffer_.empty()) { // if buffer_ is empty, the 'out' will return as an empty vector. return; } - int out_num = buffer_[0].size(); + size_t out_num = buffer_[0].size(); out->reserve(out_num); - for (int j = 0; j < out_num; ++j) { + for (size_t j = 0; j < out_num; ++j) { // Merge shape and check date type std::type_index batch_type = buffer_[0][j].type(); framework::DDim batch_shape = buffer_[0][j].dims(); diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index a75c6d4c567ac93f37b38070421133af305f20a3..85394b336fc967fc6973131fbedda4c796825185 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -33,7 +33,7 @@ class CustomReader : public framework::DecoratedReader { source_var_names_(source_var_names), sink_var_names_(sink_var_names) {} - void ReadNext(std::vector* out) override; + void ReadNextImpl(std::vector* out) override; private: const framework::ProgramDesc program_; @@ -60,10 +60,10 @@ class CreateCustomReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - out->Reset( - new CustomReader(underlying_reader.Get(), *sub_block, - Attr>("source_var_names"), - Attr>("sink_var_names"))); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, *sub_block, + Attr>("source_var_names"), + Attr>("sink_var_names"))); } }; @@ -143,7 +143,7 @@ class CustomReaderInferVarType : public framework::VarTypeInference { } }; -void CustomReader::ReadNext(std::vector* out) { +void CustomReader::ReadNextImpl(std::vector* out) { out->clear(); std::vector underlying_outs; reader_->ReadNext(&underlying_outs); diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 5f734489a81764875988f440696682570ff4d1d7..ed719f91d0980480aa62a5cd3c1f819e6c0e7475 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -12,74 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // NOLINT - -#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { namespace operators { namespace reader { - -// 'Double buffer' means we shall maintain two batches of input data at the same -// time. So the kCacheSize shoul be at least 2. -static constexpr size_t kCacheSize = 5; -// There will be two bacthes out of the channel during training: -// 1. the one waiting to be sent to the channel -// 2. the one just be received from the channel, which is also being used by -// subsequent operators. 
-// So the channel size should be kChacheSize - 2 -static constexpr size_t kChannelSize = 3; // kCacheSize - 2 - -class DoubleBufferReader : public framework::DecoratedReader { - public: - explicit DoubleBufferReader( - const std::shared_ptr& reader, - platform::Place target_place = platform::CPUPlace()) - : DecoratedReader(reader), place_(target_place) { - cpu_tensor_cache_.resize(kCacheSize); - gpu_tensor_cache_.resize(kCacheSize); -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place_)) { - for (size_t i = 0; i < kCacheSize; ++i) { - ctxs_.emplace_back(new platform::CUDADeviceContext( - boost::get(place_))); - } - } -#endif - StartPrefetcher(); - } - - void ReadNext(std::vector* out) override; - void ReInit() override; - - ~DoubleBufferReader() { EndPrefetcher(); } - - private: - void StartPrefetcher() { - channel_ = new reader::BlockingQueue(kChannelSize); - prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); - } - - void EndPrefetcher() { - channel_->Close(); - if (prefetcher_.joinable()) { - prefetcher_.join(); - } - delete channel_; - channel_ = nullptr; - } - - void PrefetchThreadFunc(); - - std::thread prefetcher_; - reader::BlockingQueue* channel_; - platform::Place place_; - std::vector> cpu_tensor_cache_; - std::vector> gpu_tensor_cache_; - std::vector> ctxs_; -}; - class CreateDoubleBufferReaderOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; @@ -109,7 +47,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { place = platform::CUDAPlace(static_cast(num)); } - out->Reset(new DoubleBufferReader(underlying_reader.Get(), place)); + out->Reset(framework::MakeDecoratedReader(underlying_reader, + place, 2)); } }; @@ -136,57 +75,6 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; -void DoubleBufferReader::ReadNext(std::vector* out) { - size_t cached_tensor_id; - if (channel_->Receive(&cached_tensor_id)) { - if (platform::is_gpu_place(place_)) { - *out = gpu_tensor_cache_[cached_tensor_id]; - } else { - // CPU place - *out = cpu_tensor_cache_[cached_tensor_id]; - } - } else { - out->clear(); - } -} - -void DoubleBufferReader::ReInit() { - reader_->ReInit(); - EndPrefetcher(); - StartPrefetcher(); -} - -void DoubleBufferReader::PrefetchThreadFunc() { - VLOG(5) << "A new prefetch thread starts."; - size_t cached_tensor_id = 0; - while (true) { - auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id]; - reader_->ReadNext(&cpu_batch); - if (cpu_batch.empty()) { - // The underlying reader have no next data. - break; - } - if (platform::is_gpu_place(place_)) { - auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id]; - gpu_batch.resize(cpu_batch.size()); - for (size_t i = 0; i < cpu_batch.size(); ++i) { - // TODO(fengjiayi): Use asynchronous TensorCopy instead - framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]); - gpu_batch[i].set_lod(cpu_batch[i].lod()); - } - } - if (!channel_->Send(cached_tensor_id)) { - VLOG(5) << "WARNING: The double buffer channel has been closed. 
The " - "prefetch thread will terminate."; - break; - } - ++cached_tensor_id; - cached_tensor_id %= kCacheSize; - } - channel_->Close(); - VLOG(5) << "Prefetch thread terminates."; -} - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 19b54110b9aeece33b8d6c73612ae0e12dbfafbd..0a225597d34f43c7fb82aeae2552cdf16c8ba566 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -24,23 +24,22 @@ class MultiPassReader : public framework::DecoratedReader { MultiPassReader(const std::shared_ptr& reader, int pass_num) : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} - void ReadNext(std::vector* out) override { + void ReadNextImpl(std::vector* out) override { reader_->ReadNext(out); - if (out->empty()) { + if (out->empty() && pass_count_ < pass_num_ - 1) { + reader_->Shutdown(); + reader_->Start(); + reader_->ReadNext(out); ++pass_count_; - if (pass_count_ < pass_num_) { - reader_->ReInit(); - reader_->ReadNext(out); - } } } - void ReInit() override { + private: + void StartImpl() override { pass_count_ = 0; - reader_->ReInit(); + reader_->Start(); } - private: int pass_num_; mutable int pass_count_; }; @@ -60,7 +59,8 @@ class CreateMultiPassReaderOp : public framework::OperatorBase { const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); int pass_num = Attr("pass_num"); - out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num)); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, pass_num)); } }; diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 36587360f7347a10e01d4e994482027d9a9bb5d0..0f31ca1a94326956ae5e6dffd582daedeb55a9e3 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -19,9 +19,10 @@ namespace paddle { namespace operators { namespace reader { -class PyReader : public framework::ReaderBase { +class PyReader : public framework::FileReader { public: - explicit PyReader(const std::shared_ptr& queue) { + explicit PyReader(const std::shared_ptr& queue) + : framework::FileReader() { PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); queue_ = queue; } @@ -32,7 +33,11 @@ class PyReader : public framework::ReaderBase { if (!success) out->clear(); } - void ReInit() override {} + ~PyReader() { queue_->Close(); } + + void Shutdown() override { queue_->Close(); } + + void Start() override { queue_->ReOpen(); } private: std::shared_ptr queue_; @@ -51,14 +56,14 @@ class CreatePyReaderOp : public framework::OperatorBase { const std::string& queue_name = Input("blocking_queue"); auto* queue_holder_var = scope.FindVar(queue_name); - PADDLE_ENFORCE( - queue_holder_var != nullptr, + PADDLE_ENFORCE_NOT_NULL( + queue_holder_var, "No LoDTensorBlockingQueueHolder variable with name %s found", queue_name); auto* queue_holder = queue_holder_var->template GetMutable(); - out->Reset(new PyReader(queue_holder->GetQueue())); + out->Reset(std::make_shared(queue_holder->GetQueue())); } }; diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc index 5b7e8a063a034f0be056065826fca0fe807bc9a7..e5c116dfcd71ef40597ca19d1da0b51038baaad1 100644 --- 
a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -19,11 +19,11 @@ namespace operators { namespace reader { template -class RandomDataGenerator : public framework::ReaderBase { +class RandomDataGenerator : public framework::FileReader { public: RandomDataGenerator(const std::vector& shapes, float low, float high) - : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) { + : framework::FileReader(), low_(low), high_(high), shapes_(shapes) { PADDLE_ENFORCE_LE(low, high, "'low' shouldn't be greater than 'high'.(%f vs %f)", low, high); @@ -32,7 +32,7 @@ class RandomDataGenerator : public framework::ReaderBase { dist_ = std::uniform_real_distribution(low_, high_); } - void ReadNext(std::vector* out) override { + void ReadNextImpl(std::vector* out) override { out->clear(); out->reserve(shapes_.size()); for (const framework::DDim& shape : shapes_) { @@ -51,8 +51,6 @@ class RandomDataGenerator : public framework::ReaderBase { } } - void ReInit() override { return; } - private: float low_; float high_; @@ -79,8 +77,8 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase { std::vector shapes = RestoreShapes(shape_concat, ranks); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new RandomDataGenerator(shapes, Attr("low"), - Attr("high"))); + out->Reset(std::make_shared>( + shapes, Attr("low"), Attr("high"))); } }; diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 559827f08494af6730aafa1e67c46a47c21dedf6..a08a9dbd0da46e73082cdd24c019e8d210d8bcc4 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -21,10 +21,8 @@ namespace reader { template class RecordIOFileReader : public framework::FileReader { public: - explicit RecordIOFileReader(const std::string& filename, - const std::vector& dims) - : FileReader(dims), - scanner_(filename), + explicit RecordIOFileReader(const std::string& filename) + : scanner_(filename), dev_ctx_(*platform::DeviceContextPool::Instance().Get( platform::CPUPlace())) { if (ThreadSafe) { @@ -33,18 +31,21 @@ class RecordIOFileReader : public framework::FileReader { LOG(INFO) << "Creating file reader" << filename; } - void ReInit() override { scanner_.Reset(); } - protected: void ReadNextImpl(std::vector* out) override { + std::unique_ptr> guard; if (ThreadSafe) { - std::lock_guard guard(*mutex_); - *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_); - } else { - *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_); + guard.reset(new std::lock_guard(*mutex_)); + } + + bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out); + if (!ok) { + out->clear(); } } + void StartImpl() override { scanner_.Reset(); } + private: std::unique_ptr mutex_; recordio::Scanner scanner_; @@ -58,20 +59,11 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - const auto& shape_concat = Attr>("shape_concat"); - const auto& ranks = Attr>("ranks"); - PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); - PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - static_cast(shape_concat.size()), - "The accumulate of all ranks should be equal to the " - "shape concat's length."); std::string filename = Attr("filename"); - 
auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new RecordIOFileReader( - filename, RestoreShapes(shape_concat, ranks))); + out->Reset(std::make_shared>(filename)); } }; diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 57e8e21214b7c99e52550fe51a67c9b5201cb46f..3f72890a7cee1453585d50afa04fa62a9b059dc3 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -34,7 +34,7 @@ class ShuffleReader : public framework::DecoratedReader { ReloadBuffer(); } - void ReadNext(std::vector* out) override { + void ReadNextImpl(std::vector* out) override { out->clear(); if (iteration_pos_ >= buffer_.size()) { VLOG(10) << "Resetting shuffle buffer"; @@ -47,6 +47,17 @@ class ShuffleReader : public framework::DecoratedReader { } private: + void ShutdownImpl() override { + reader_->Shutdown(); + buffer_.clear(); + iteration_pos_ = 0; + } + + void StartImpl() override { + reader_->Start(); + ReloadBuffer(); + } + void ReloadBuffer() { buffer_.clear(); buffer_.reserve(buffer_size_); @@ -86,9 +97,8 @@ class CreateShuffleReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - out->Reset( - new ShuffleReader(underlying_reader.Get(), - static_cast(Attr("buffer_size")))); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, static_cast(Attr("buffer_size")))); } }; diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc deleted file mode 100644 index 3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
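Editor's note: across the reader changes above, the old ReInit() entry point is replaced by a Shutdown()/Start() pair on ReaderBase, with decorated readers overriding the *Impl hooks; the dedicated threaded reader below is removed outright. A hedged sketch of the resulting driver-side lifecycle (the reader variable and pass_num are illustrative, not part of the patch):

std::shared_ptr<paddle::framework::ReaderBase> reader = /* any reader from the ops above */;
std::vector<paddle::framework::LoDTensor> batch;
for (int pass = 0; pass < pass_num; ++pass) {
  reader->ReadNext(&batch);
  while (!batch.empty()) {   // an empty vector signals end of data
    // ... feed `batch` to the executor ...
    reader->ReadNext(&batch);
  }
  reader->Shutdown();        // release file handles / close blocking queues
  reader->Start();           // rewind so the next pass can re-read from the beginning
}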
- -#include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -class ThreadedReader : public framework::DecoratedReader { - public: - explicit ThreadedReader(const std::shared_ptr& reader) - : DecoratedReader(reader) {} - - void ReadNext(std::vector* out) override { - std::lock_guard lock(mutex_); - reader_->ReadNext(out); - } - - void ReInit() override { reader_->ReInit(); } - - private: - std::mutex mutex_; -}; - -class CreateThreadedReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto* out = detail::Ref(scope.FindVar(Output("Out"))) - .GetMutable(); - if (out->Get() != nullptr) { - return; - } - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - out->Reset(new ThreadedReader(underlying_reader.Get())); - } -}; - -class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { - protected: - void Apply() override { - AddComment(R"DOC( - CreateThreadedReader Operator - - This operator creates a threaded reader. A threaded reader's - 'ReadNext()' can be invoked by several threads at the same - time. - When the attribute 'safe_mode' is true, the threaded reader's - 'ReInit()' is disabled to avoid unexpected bugs in multi-thread - environment. - )DOC"); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace reader = paddle::operators::reader; -REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader, - reader::CreateThreadedReaderOp, - reader::CreateThreadedReaderOpMaker); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 30d962ba10a954a837f9771d21cedf0feb643439..4f7cfc24ec035349f3c85e84d876ad9b5b5493a6 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -38,12 +38,10 @@ class LoDTensorBlockingQueue { public: bool Push(const std::vector& lod_tensor_vec) { - CheckDims(lod_tensor_vec); return queue_.Send(lod_tensor_vec); } bool Push(std::vector&& lod_tensor_vec) { - CheckDims(lod_tensor_vec); return queue_.Send(std::move(lod_tensor_vec)); } @@ -58,25 +56,13 @@ class LoDTensorBlockingQueue { inline size_t Size() const { return queue_.Size(); } - inline void Close() { return queue_.Close(); } + inline void ReOpen() { queue_.ReOpen(); } + + inline void Close() { queue_.Close(); } inline bool IsClosed() const { return queue_.IsClosed(); } private: - void CheckDims(const std::vector& lod_tensor_vec) { - PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), - "Expect input size is %d but found %s", dims_.size(), - lod_tensor_vec.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - const auto& in_dims = framework::slice_ddim( - lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size()); - const auto& expect_dims = - framework::slice_ddim(dims_[i], 1, dims_[i].size()); - PADDLE_ENFORCE(in_dims == expect_dims, - "Dims of the %d-th input tensor do not match", i); - } - } - BlockingQueue> queue_; std::vector dims_; }; diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 31e5d81e55ed9703eb3a9ef2595fa2a280f1a734..38223e069975a08791d58d6ae10e2112b79a61fe 100644 --- 
a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -12,152 +12,200 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include // NOLINT - +#include "ThreadPool.h" +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { namespace operators { namespace reader { -class MultiFileReader : public framework::ReaderBase { +class IReaderContainer { public: - MultiFileReader(const std::vector& file_names, - const std::vector& dims, size_t thread_num, - size_t buffer_size) - : buffer_size_(buffer_size) { - readers_.reserve(file_names.size()); - for (const std::string& f_name : file_names) { - readers_.emplace_back(CreateReaderByFileName(f_name, dims)); + virtual ~IReaderContainer() {} + virtual void AppendReader( + std::unique_ptr&& readers) = 0; + virtual void Stop() = 0; + virtual void Start() = 0; + virtual void ReadNext(std::vector* out) = 0; +}; + +class OrderedReaderContainer : public IReaderContainer { + public: + void AppendReader(std::unique_ptr&& reader) override { + pending_.emplace(std::move(reader)); + } + + void Stop() override { + while (!pending_.empty()) { + MoveFrontPendingToDone(); } - prefetchers_.resize(thread_num); - StartNewScheduler(); } - void ReadNext(std::vector* out) override; - void ReInit() override; + void Start() override { std::swap(done_, pending_); } - ~MultiFileReader() { EndScheduler(); } + void ReadNext(std::vector* out) override { + if (!pending_.empty()) { + pending_.front()->ReadNext(out); + if (out->empty()) { + MoveFrontPendingToDone(); + ReadNext(out); + } + } else { + out->clear(); + } + } private: - void StartNewScheduler(); - void EndScheduler(); - void ScheduleThreadFunc(); - void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx); - - std::vector> readers_; - std::thread scheduler_; - std::vector prefetchers_; - size_t buffer_size_; - reader::BlockingQueue* waiting_reader_idx_; - reader::BlockingQueue* available_thread_idx_; - reader::BlockingQueue>* buffer_; + void MoveFrontPendingToDone() { + pending_.front()->Shutdown(); + pending_.front()->Start(); + done_.emplace(move(pending_.front())); + pending_.pop(); + } + + std::queue> pending_; + std::queue> done_; }; -void MultiFileReader::ReadNext(std::vector* out) { - if (!buffer_->Receive(out)) { - out->clear(); - } -} - -void MultiFileReader::ReInit() { - EndScheduler(); - StartNewScheduler(); -} - -void MultiFileReader::StartNewScheduler() { - size_t thread_num = prefetchers_.size(); - waiting_reader_idx_ = new reader::BlockingQueue(readers_.size()); - available_thread_idx_ = new reader::BlockingQueue(thread_num); - buffer_ = new reader::BlockingQueue>( - buffer_size_); - - for (size_t i = 0; i < readers_.size(); ++i) { - waiting_reader_idx_->Send(i); - } - waiting_reader_idx_->Close(); - for (size_t i = 0; i < thread_num; ++i) { - available_thread_idx_->Send(i); - } +class PreemptiveReaderContainer : public IReaderContainer { + using ReaderList = std::list>; - scheduler_ = std::thread([this] { ScheduleThreadFunc(); }); -} + struct FutureItem { + std::vector data_; + ReaderList::iterator reader_it_; + std::exception_ptr exception_; + }; -void MultiFileReader::EndScheduler() { - available_thread_idx_->Close(); - buffer_->Close(); - waiting_reader_idx_->Close(); - if 
(scheduler_.joinable()) { - scheduler_.join(); - } - delete buffer_; - delete available_thread_idx_; - delete waiting_reader_idx_; -} - -void MultiFileReader::ScheduleThreadFunc() { - VLOG(5) << "MultiFileReader schedule thread starts."; - size_t completed_thread_num = 0; - size_t thread_idx; - while (available_thread_idx_->Receive(&thread_idx)) { - std::thread& prefetcher = prefetchers_[thread_idx]; - if (prefetcher.joinable()) { - prefetcher.join(); - } - size_t reader_idx; - if (waiting_reader_idx_->Receive(&reader_idx)) { - // Still have files to read. Start a new prefetch thread. - prefetcher = std::thread([this, reader_idx, thread_idx] { - PrefetchThreadFunc(reader_idx, thread_idx); - }); - } else { - // No more file to read. - ++completed_thread_num; - if (completed_thread_num == prefetchers_.size()) { - buffer_->Close(); - break; + using FutureList = std::list>; + + public: + explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {} + + void Stop() override { + if (!pending_.empty()) { + for (auto& reader : pending_) { + reader->Shutdown(); + } + for (auto& fu : futures_) { + fu.wait(); } + futures_.clear(); + for (auto& reader : pending_) { + reader->Start(); + done_.emplace_back(std::move(reader)); + } + pending_.clear(); + bool timeout; + complete_queue_.PopAll(1000, &timeout); + PADDLE_ENFORCE(!timeout); } } - // If users invoke ReInit() when scheduler is running, it will close the - // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler - // to release their resource. So a check is needed before scheduler ends. - for (auto& p : prefetchers_) { - if (p.joinable()) { - p.join(); + + void Start() override { + for (auto& reader : done_) { + AppendReader(std::move(reader)); } + done_.clear(); } - VLOG(5) << "MultiFileReader schedule thread terminates."; -} - -void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) { - VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts."; - std::unique_ptr& reader = readers_[reader_idx]; - while (true) { - std::vector ins; - reader->ReadNext(&ins); - if (ins.empty()) { - reader->ReInit(); - break; + + void ReadNext(std::vector* out) override { + if (!pending_.empty()) { + auto future_it = complete_queue_.Pop(); + FutureItem item = future_it->get(); + if (item.exception_) { + for (auto it = futures_.begin(); it != futures_.end(); ++it) { + if (it != future_it) { + it->wait(); // Wait all other threads complete. + } + } + std::rethrow_exception(item.exception_); + + } else if (item.data_.empty()) { // reader done. + done_.emplace_back(std::move(*item.reader_it_)); + pending_.erase(item.reader_it_); + futures_.erase(future_it); + ReadNext(out); + } else { + *out = item.data_; + // continue read async + ReadAsync(item.reader_it_, &future_it); + } + } else { + out->clear(); } - try { - buffer_->Send(std::move(ins)); - } catch (paddle::platform::EnforceNotMet e) { - VLOG(5) << "WARNING: The buffer channel has been closed. 
The prefetch " - "thread of file idx '" - << reader_idx << "' will terminate."; - break; + } + + private: + void AppendReader(std::unique_ptr&& reader) override { + pending_.emplace_back(std::move(reader)); + auto reader_it = pending_.end(); + --reader_it; + + futures_.emplace_back(); + auto future_it = futures_.end(); + --future_it; + + ReadAsync(reader_it, &future_it); + } + + void ReadAsync(const ReaderList::iterator& reader_it, + FutureList::iterator* future_it_ptr) { + auto& future_it = *future_it_ptr; + *future_it = pool_.enqueue([reader_it, future_it, this] { + try { + FutureItem item; + item.reader_it_ = reader_it; + (*reader_it)->ReadNext(&item.data_); + if (item.data_.empty()) { + (*reader_it)->Shutdown(); + (*reader_it)->Start(); + } + complete_queue_.Push(future_it); + return item; + } catch (...) { + FutureItem item; + item.exception_ = std::current_exception(); + complete_queue_.Push(future_it); + return item; + } + }); + } + + FutureList futures_; + ThreadPool pool_; + framework::BlockingQueue complete_queue_; + std::list> pending_; + std::list> done_; +}; + +class MultiFileReader : public framework::ReaderBase { + public: + MultiFileReader(const std::vector& file_names, + std::unique_ptr&& container) + : container_(std::move(container)) { + for (auto& fn : file_names) { + container_->AppendReader(CreateReaderByFileName(fn)); } } - if (!available_thread_idx_->Send(thread_idx)) { - VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. " - "Fail to send thread_idx."; + ~MultiFileReader() { container_->Stop(); } + + protected: + void ReadNextImpl(std::vector* out) override { + container_->ReadNext(out); } - VLOG(5) << "The prefetch thread of file idx '" << reader_idx - << "' terminates."; -} + void ShutdownImpl() override { container_->Stop(); } + void StartImpl() override { container_->Start(); } + + private: + std::unique_ptr container_; +}; class OpenFilesOp : public framework::OperatorBase { public: @@ -175,14 +223,27 @@ class OpenFilesOp : public framework::OperatorBase { "shape concat's length."); const auto& file_names = Attr>("file_names"); PADDLE_ENFORCE(!file_names.empty(), "No file to be read!"); - const size_t thread_num = Attr("thread_num"); - const size_t buffer_size = Attr("buffer_size"); + bool is_test = Attr("is_test"); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new MultiFileReader(file_names, - RestoreShapes(shape_concat, ranks), - thread_num, buffer_size)); + std::unique_ptr container; + + if (is_test) { + container.reset(new OrderedReaderContainer()); + } else { + container.reset(new PreemptiveReaderContainer( + static_cast(Attr("thread_num")))); + } + + std::shared_ptr reader( + new MultiFileReader(file_names, std::move(container))); + auto buffer_size = Attr("buffer_size"); + if (buffer_size > 1) { + reader = framework::MakeDecoratedReader( + reader, platform::CPUPlace(), buffer_size); + } + out->Reset(reader); } }; @@ -190,9 +251,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase { protected: void Apply() override { AddAttr>("file_names", "Files to be read."); - AddAttr("thread_num", "The maximal concurrent prefetch thread number.") - .GreaterThan(0); - AddAttr("buffer_size", "The size of prefetch buffer.").GreaterThan(0); + AddAttr("is_test", "Used for testing data.").SetDefault(false); AddComment(R"DOC( OpenFiles Operator @@ -200,6 +259,11 @@ class OpenFilesOpMaker : public FileReaderMakerBase { An OpenFilesOp creates a MultiFileReader, which is able to read data multi-threaded from multiple 
files. )DOC"); + AddAttr("thread_num", + "The maximal concurrent prefetch thread number. Used only " + "when is_test = False"); + AddAttr("buffer_size", "The reading buffer of these files.") + .GreaterThan(0); } }; diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index e11256a49ffa6adc9410376cc8a71fa017df7e9c..b82aab1214992be73d876a42424234e3cea46455 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -39,7 +39,7 @@ std::unordered_map& FileReaderRegistry() { } std::unique_ptr CreateReaderByFileName( - const std::string& file_name, const std::vector& dims) { + const std::string& file_name) { size_t separator_pos = file_name.find_last_of(kFileFormatSeparator); PADDLE_ENFORCE_NE(separator_pos, std::string::npos, "File name illegal! A legal file name should be like: " @@ -49,7 +49,7 @@ std::unique_ptr CreateReaderByFileName( auto itor = FileReaderRegistry().find(filetype); PADDLE_ENFORCE(itor != FileReaderRegistry().end(), "No file reader registered for '%s' format.", filetype); - framework::ReaderBase* reader = (itor->second)(file_name, dims); + framework::ReaderBase* reader = (itor->second)(file_name); return std::unique_ptr(reader); } diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 244bf15f068a47efc29ee54492cdbdeb10025020..25c3e7d77b788d38daf6dee1fc79e5c1c97e8842 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -25,22 +25,21 @@ namespace reader { static constexpr char kFileFormatSeparator[] = "."; -using FileReaderCreator = std::function&)>; +using FileReaderCreator = + std::function; std::unordered_map& FileReaderRegistry(); template int RegisterFileReader(const std::string& filetype) { - FileReaderRegistry()[filetype] = []( - const std::string& fn, const std::vector& dims) { - return new Reader(fn, dims); + FileReaderRegistry()[filetype] = [](const std::string& fn) { + return new Reader(fn); }; return 0; } std::unique_ptr CreateReaderByFileName( - const std::string& file_name, const std::vector& dims); + const std::string& file_name); extern std::vector RestoreShapes( const std::vector& shape_concat, const std::vector& ranks); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 9854a31f5b10f5ecd940c0d41c2c3e468fc17bad..4d34b8a1686efb1fc30020f0d27e9a3c3a6c0866 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -40,18 +40,19 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); + rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); } if (sync_mode) { - rpc_client->Wait(); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; @@ -59,6 +60,8 @@ class RecvOp : public framework::OperatorBase { class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddInput("X", "(Any) Dummy inputs, used for 
control dependency") + .AsDuplicable(); AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); AddComment(R"DOC( Recv operator diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc index c5b5398787b44e658b0f8390162df0e6c3006651..f0e5f6580fbc9e70562cb2fdd7e0c5d8729bc9a7 100644 --- a/paddle/fluid/operators/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL( ops::ReduceKernel, ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OP_CPU_KERNEL( + reduce_sum_grad, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h index e67d7e1da5f0244d2dee346873692a80cbad2fc4..3e8d1bbdba504669bc06e0637094e3bee840adf2 100644 --- a/paddle/fluid/operators/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -14,11 +14,69 @@ #pragma once +#include + #include "paddle/fluid/operators/reduce_op.h" namespace paddle { namespace operators { +// use for loop to speed up Eigen broadcast. 4 timer faster then broadcast +template +class ReduceSumGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto dims = context.Attr>("dim"); + if (context.GetPlace().type() == typeid(platform::CPUPlace) && + dims.size() == 1) { + auto* input0 = context.Input("X"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + const auto* input2_d = input2->data(); + auto* output_d = output->data(); + + // handle reduce_all + if (input2->dims().size() == 1 && input2->dims()[0] == 1) { + for (int64_t i = 0; i < framework::product(input0->dims()); ++i) { + output_d[i] = input2_d[0]; + } + return; + } + + // handle reduce by one dimension + int reduce_dim_index = dims[0]; + if (reduce_dim_index < 0) { + reduce_dim_index += input0->dims().size(); + } + + auto& input_dim = input0->dims(); + int64_t before_dim = 1; + for (int i = 0; i < reduce_dim_index; ++i) { + before_dim *= input_dim[i]; + } + int64_t reduce_dim = input_dim[reduce_dim_index]; + int64_t after_dim = 1; + for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) { + after_dim *= input_dim[i]; + } + for (int64_t i = 0; i < before_dim; ++i) { + for (int64_t j = 0; j < reduce_dim; ++j) { + for (int64_t k = 0; k < after_dim; ++k) { + output_d[i * reduce_dim * after_dim + j * after_dim + k] = + input2_d[i * after_dim + k]; + } + } + } + return; + } + + // default use Eigen broadcast + ReduceGradKernel kernel; + kernel.Compute(context); + } +}; + struct SumFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { @@ -31,7 +89,7 @@ struct SumGradFunctor { typename DY, typename Dim> void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); + dx->device(place) = dy->eval().broadcast(dim); } }; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 918f3be533d51367eade5f5108ad2eab954a9303..d72f85f2c44db2fa887732cfc05e1376a6a79e4a 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -127,12 +127,6 @@ class 
ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); - AddAttr("inplace", - "(default: false) Change the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Reshape Operator. @@ -216,7 +210,7 @@ class ReshapeKernel { if (shape_tensor) { auto *shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(ctx.GetPlace())) { + if (platform::is_gpu_place(shape_tensor->place())) { TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } @@ -233,16 +227,9 @@ class ReshapeKernel { "sequence_reshape op."); } - bool inplace = ctx.Attr("inplace"); + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } } }; @@ -251,19 +238,93 @@ class ReshapeGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *d_out = ctx.Input(framework::GradVarName("Out")); auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - bool inplace = ctx.Attr("inplace"); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); + } +}; - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); +// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape, +// the XShape is used to carry the shape and lod of X which will be used in +// reshape_grad, in this way, the framework can reuse the memory of X +// immediately the reshape_op is finished. 
+// Considering compatibility issues, we could not fix reshape_op +class Reshape2Op : public ReshapeOp { + public: + Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReshapeOp(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + ReshapeOp::InferShape(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) of ReshapeOp should not be null."); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } +}; + +class Reshape2OpMaker : public ReshapeOpMaker { + public: + void Make() override { + ReshapeOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in FlattenGradOp.") + .AsIntermediate(); + } +}; + +class Reshape2GradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reshape2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Reshape2GradOp : public framework::OperatorWithKernel { + public: + Reshape2GradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = ctx->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("XShape", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); } }; @@ -282,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); +REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, + ops::Reshape2GradMaker); +REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); + #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, @@ -290,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, 
float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc index 919ebe48ca38040274bd2052b95ef96eccff4db6..2f773f222e50a440801b06a4fd997bf237b34772 100644 --- a/paddle/fluid/operators/rmsprop_op.cc +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(param_out) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), - "Output(Momentum_out) of RmspropOp should not be null."); + "Output(MomentOut) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), "Output(MeanSquareOut) of RmspropOp should not be null."); + if (ctx->Attrs().Get("centered")) { + PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"), + "Output(MeanGradOut) of RmspropOp should not be null."); + } auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( @@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MeanSquareOut", param_dim); + if (ctx->Attrs().Get("centered")) { + ctx->SetOutputDim("MeanGradOut", param_dim); + } } }; @@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("MeanSquare", "(Tensor, default Tensor)" " The mean square value that gets updated."); + AddInput("MeanGrad", + "(Tensor, default Tensor)" + " The moving average of gradient") + .AsDispensable(); AddInput("LearningRate", "(Tensor, default Tensor) " "The learning rate should be a tensor of size 1."); @@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); + AddOutput("MeanGradOut", + "(Tensor) Output moving average of gradient updated value."); AddAttr("epsilon", "(float, default 1e-10) Constant " @@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.9f); AddAttr("momentum", "(float, default 0.0) Constant value.") .SetDefault(0.0f); + AddAttr("centered", "(bool, default false) use centered rmsprop.") + .SetDefault(false); AddComment(R"DOC( Rmsprop Optimizer. 
@@ -103,6 +118,14 @@ MomentOut = momentum * Moment + ParamOut = Param - MomentOut $$ +if centered is true: + +mean_grad = decay * mean_square{t-1} + (1-decay) * gradient +mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2 +mom = momentum * mom{t-1} + learning_rate * g_t / + sqrt(mean_square - mean_grad**2 + epsilon) +param -= mom + The original slides that proposed Rmsprop: Slide 29 of http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 12836f43bde47ac87eb0af33dea501593b659a5d..25ed32c5ebb2ff5be962ac1e3e38c970623d705c 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel { float epsilon = ctx.Attr("epsilon"); float rho = ctx.Attr("decay"); float momentum = ctx.Attr("momentum"); + bool centered = ctx.Attr("centered"); auto p = EigenVector::Flatten(*ctx.Input("Param")); auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); @@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel { auto ms_out = EigenVector::Flatten(*mean_square_out); auto& place = *ctx.template device_context().eigen_device(); - Eigen::DSizes grad_dsize(grad->numel()); + Eigen::DSizes grad_dsize(static_cast(grad->numel())); ms_out.device(place) = rho * ms + (1 - rho) * g * g; - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + if (centered) { + auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); + auto* mean_grad_out = ctx.Output("MeanGradOut"); + mean_grad_out->mutable_data(ctx.GetPlace()); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = momentum * mom + + lr.broadcast(grad_dsize) * g / + (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + + lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + } p_out.device(place) = p - mom_out; } }; diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 50450b62f7b1c0b2b5abf01a43581a0e2d2cd01e..46e20285db6d7acd39dead3994409645adddf494 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) { template __global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const int64_t* input_rois, + const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { @@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward( int c = (i / pooled_width / pooled_height) % channels; int n = i / pooled_width / pooled_height / channels; - const int64_t* offset_input_rois = input_rois + n * kROISize; + const T* offset_input_rois = input_rois + n * kROISize; int roi_batch_ind = roi_batch_id_data[n]; int roi_start_w = round(offset_input_rois[0] * spatial_scale); int roi_start_h = round(offset_input_rois[1] * spatial_scale); @@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward( template __global__ void GPUROIPoolBackward( - const int nthreads, const int64_t* input_rois, const T* output_grad, + const int nthreads, const T* input_rois, const T* output_grad, const int64_t* argmax_data, const int num_rois, const float spatial_scale, 
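The centered branch added to rmsprop_op.h keeps a moving average of the raw gradient (MeanGrad) alongside the mean square and normalizes by the variance estimate ms - mg^2 rather than the raw second moment; note the kernel updates mg as rho * mg + (1 - rho) * g, with the "decay" attribute read into rho. A scalar, per-element sketch of one centered step (illustrative only, not part of the patch):

#include <cmath>
#include <cstdio>
#include <vector>

// One centered RMSProp step over flat arrays, following RmspropOpKernel:
//   ms  <- rho * ms + (1 - rho) * g * g
//   mg  <- rho * mg + (1 - rho) * g
//   mom <- momentum * mom + lr * g / sqrt(ms - mg * mg + epsilon)
//   p   <- p - mom
void CenteredRmspropStep(std::vector<float>* p, std::vector<float>* ms,
                         std::vector<float>* mg, std::vector<float>* mom,
                         const std::vector<float>& g, float lr, float rho,
                         float momentum, float epsilon) {
  for (size_t i = 0; i < p->size(); ++i) {
    (*ms)[i] = rho * (*ms)[i] + (1.f - rho) * g[i] * g[i];
    (*mg)[i] = rho * (*mg)[i] + (1.f - rho) * g[i];
    (*mom)[i] =
        momentum * (*mom)[i] +
        lr * g[i] / std::sqrt((*ms)[i] - (*mg)[i] * (*mg)[i] + epsilon);
    (*p)[i] -= (*mom)[i];
  }
}

int main() {
  std::vector<float> p{1.f, 2.f}, ms{0.f, 0.f}, mg{0.f, 0.f}, mom{0.f, 0.f};
  const std::vector<float> g{0.1f, -0.2f};
  CenteredRmspropStep(&p, &ms, &mg, &mom, g, /*lr=*/0.01f, /*rho=*/0.9f,
                      /*momentum=*/0.0f, /*epsilon=*/1e-10f);
  std::printf("p = [%f, %f]\n", p[0], p[1]);
  return 0;
}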
const int channels, const int height, const int width, const int pooled_height, const int pooled_width, int* roi_batch_id_data, @@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { GPUROIPoolForward< T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - channels, height, width, pooled_height, pooled_width, + output_size, in->data(), rois->data(), spatial_scale, channels, + height, width, pooled_height, pooled_width, roi_batch_id_list_gpu.data(), out->mutable_data(ctx.GetPlace()), argmax->mutable_data(ctx.GetPlace())); } @@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (output_grad_size > 0) { GPUROIPoolBackward< T><<>>( - output_grad_size, rois->data(), out_grad->data(), + output_grad_size, rois->data(), out_grad->data(), argmax->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, roi_batch_id_list_gpu.data(), diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index c4f739b2c6b2d62ebebcc15fd627ebad040e7b3f..07de7c9f0e070cef7c6f38f8d564ab76910842db 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { T* output_data = out->mutable_data(ctx.GetPlace()); int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - const int64_t* rois_data = rois->data(); + const T* rois_data = rois->data(); for (int n = 0; n < rois_num; ++n) { int roi_batch_id = roi_batch_id_data[n]; int roi_start_w = round(rois_data[0] * spatial_scale); @@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { } } - const int64_t* rois_data = rois->data(); + const T* rois_data = rois->data(); const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..724463c95c4a29fb5c00fe791b389d3908771640 --- /dev/null +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sampling_id_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class SamplingIdOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SamplingIdOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SamplingIdOp should not be null."); + PADDLE_ENFORCE( + ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), + "min must less then max"); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 2, + "Input(X, Filter) should be 2-D tensor."); + + framework::DDim dims = input_dims; + ctx->SetOutputDim("Out", dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of softmax. " + "2-D with shape [batch_size, input_feature_dimensions]."); + AddOutput("Out", "SamplingId data tensor."); + AddComment(R"DOC( +SamplingId Operator. +A layer for sampling id from multinomial distribution from the + input. Sampling one id for one sample.)DOC"); + AddAttr("min", "Minimum value of random. [default 0.0].") + .SetDefault(0.0f); + AddAttr("max", "Maximun value of random. [default 1.0].") + .SetDefault(1.0f); + AddAttr("seed", + "Random seed used for the random number engine. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time. [default 0].") + .SetDefault(0); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel, + paddle::operators::SamplingIdKernel); diff --git a/paddle/fluid/operators/sampling_id_op.cu b/paddle/fluid/operators/sampling_id_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a4f0470314d00b5e370fd478736b54579c88448c --- /dev/null +++ b/paddle/fluid/operators/sampling_id_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/sampling_id_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sampling_id, paddle::operators::SamplingIdKernel, + paddle::operators::SamplingIdKernel); diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h new file mode 100644 index 0000000000000000000000000000000000000000..133d3f72dbd6ab13c98d124369038309c94cba5b --- /dev/null +++ b/paddle/fluid/operators/sampling_id_op.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SamplingIdKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("X"); + const int batch_size = static_cast(input->dims()[0]); + const int width = static_cast(input->dims()[1]); + + PADDLE_ENFORCE_GE(batch_size, 0, + "batch_size(dims[0]) must be nonnegative."); + PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative."); + + std::vector ins_vector; + framework::TensorToVector(*input, context.device_context(), &ins_vector); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist( + static_cast(context.Attr("min")), + static_cast(context.Attr("max"))); + + std::vector ids(batch_size); + for (int i = 0; i < batch_size; ++i) { + T r = dist(engine); + int idx = width - 1; + for (int j = 0; j < width; ++j) { + if ((r -= ins_vector[i * width + j]) < 0) { + idx = j; + break; + } + } + ids[i] = int64_t(idx); + } + + std::vector out_dim; + out_dim.push_back(static_cast(batch_size)); + + Tensor* output = context.Output("Out"); + output->Resize(framework::make_ddim(out_dim)); + output->mutable_data(context.GetPlace()); + framework::TensorFromVector(ids, context.device_context(), output); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index cfee9207083b46f7c27354f22e82a7d3c38a027c..5b05f757c0355ed15617dea925b5d4929fcbfee0 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include @@ -23,40 +22,11 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { -// TODO(sidgoyal78): These function are needed by other files (save_op), move -// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). 
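The SamplingIdKernel above draws one id per row by sampling r uniformly and walking the row's values until the running remainder goes negative; the row is expected to hold normalized probabilities (e.g. softmax output) when min/max keep their 0.0/1.0 defaults. A standalone sketch of that inner loop for a single row (illustrative only, not part of the patch):

#include <cstdio>
#include <random>
#include <vector>

// Sample an index from one row, mirroring SamplingIdKernel::Compute:
// subtract entries from r until it drops below zero; fall back to the
// last column if it never does.
int SampleId(const std::vector<float>& probs, std::minstd_rand* engine) {
  std::uniform_real_distribution<float> dist(0.f, 1.f);  // Attr min/max defaults
  float r = dist(*engine);
  int idx = static_cast<int>(probs.size()) - 1;
  for (size_t j = 0; j < probs.size(); ++j) {
    if ((r -= probs[j]) < 0.f) {
      idx = static_cast<int>(j);
      break;
    }
  }
  return idx;
}

int main() {
  std::minstd_rand engine(1);  // a fixed, non-zero seed gives repeatable draws
  const std::vector<float> probs{0.1f, 0.7f, 0.2f};
  std::printf("sampled id: %d\n", SampleId(probs, &engine));
  return 0;
}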
-constexpr char kSEP = '/'; -static bool FileExists(const std::string &filepath) { - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -} - -static std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -static void MkDir(const char *path) { - if (mkdir(path, 0755)) { - PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); - } -} - -static void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} - class SaveCombineOp : public framework::OperatorBase { public: SaveCombineOp(const std::string &type, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 201a51130d6b6f94104e2dabf9e7facffa672ae0..e79cffcf498c52ed14db235f6221cfdf08399c9d 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include @@ -25,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -33,36 +33,6 @@ namespace operators { // to directory specified. constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; -// TODO(yuyang18): If the functions below are needed by other files, move them -// to paddle::filesystem namespace. -constexpr char kSEP = '/'; -static bool FileExists(const std::string &filepath) { - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -} - -static std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -static void MkDir(const char *path) { - if (mkdir(path, 0755)) { - PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); - } -} - -static void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} - class SaveOp : public framework::OperatorBase { public: SaveOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -142,6 +112,8 @@ class SaveOp : public framework::OperatorBase { std::string filename = lt_var->data(); VLOG(4) << "SaveSelectedRows get File name: " << filename; + MkDirRecursively(DirName(filename).c_str()); + auto &selectedRows = var->Get(); // get device context from pool diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 7f8822e40053b5bcd394f446138a2292d80b69bf..c614de2eac143b3a545c60226aefa93dd72dea4f 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" + #include +#include "paddle/fluid/operators/detail/safe_ref.h" + namespace paddle { namespace operators { @@ -52,6 +55,21 @@ $$Out = scale*X$$ } }; +class ScaleOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto &in_var_name = op_desc.Input("X").front(); + auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name)); + + auto out_var_name = op_desc.Output("Out").front(); + auto *out_var = block->FindVarRecursive(out_var_name); + + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } +}; + class ScaleGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, + ops::ScaleOpVarTypeInference); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index c6a59b76adcd6b4d3e7db5e7c7185f266f46841f..fe035aba81dd74d21539974beed255275be3013b 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -22,17 +22,29 @@ namespace operators { template class ScaleKernel : public framework::OpKernel { public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* tensor = context.Output("Out"); - auto* in = context.Input("X"); - tensor->mutable_data(in->place()); + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in_var = ctx.InputVar("X"); + auto* in = ctx.Input("X"); - auto scale = static_cast(context.Attr("scale")); + auto* out_var = ctx.OutputVar("Out"); + auto* out = ctx.Output("Out"); + out->mutable_data(in->place()); - auto eigen_out = framework::EigenVector::Flatten(*tensor); + PADDLE_ENFORCE_EQ(in->dims(), out->dims(), + "in and out should have the same dim"); + + auto scale = static_cast(ctx.Attr("scale")); + + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->set_rows(in_slr.rows()); + out_slr->set_height(in_slr.height()); + } + + auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = - *context.template device_context().eigen_device(); + auto& dev = *ctx.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index bf5e0d864495ce3a651a31c9d5a7664fe9eb2396..c32d2603cf76f55a9e723196977b0a70c92d597a 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -81,8 +81,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The source input of scatter op"); AddInput("Ids", "The index input of scatter op where X will be updated"); - AddInput("Updates", "The updated value of updates op"); - AddOutput("Out", "The output of add op"); + AddInput("Updates", "The updated value of scatter op"); + AddOutput("Out", "The output of scatter op"); AddComment(R"DOC( Scatter Operator. 
@@ -90,7 +90,7 @@ This operator obtains output by updating the input on selected indices on the fi $$ Out = X \\ -Out[Ids] = X[Ids] + Updates +Out[Ids] = Updates $$ )DOC"); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index d29947b55e751a3e7993f765198364f4debe2472..2eefbba9726af4d38b40d91e9242faa2923dca20 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -34,9 +34,9 @@ class ScatterOpKernel : public framework::OpKernel { auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - // In place output: Out = X, Out[Ids] += Updates - Out->ShareDataWith(*X); - // Apply ScatterUpdate: Out[index] += Updates[:] + // In place output: Out = X, Out[Ids] = Updates + framework::TensorCopySync(*X, ctx.GetPlace(), Out); + // Apply ScatterUpdate: Out[index] = Updates[:] ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; @@ -53,9 +53,9 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dX = dO - dX->ShareDataWith(*dOut); + framework::TensorCopySync(*dOut, ctx.GetPlace(), dX); dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates += dO[Ids] + // Gradient by Gather: dUpdates = dO[Ids] CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 6b4572dcccc21e783f1df0b9bcde11d532ff4ba8..40404295266899c6ac2f7b1e08fdf7db40958794 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -37,33 +37,29 @@ class SendBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); - VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; + VLOG(3) << "SendBarrierOp sync"; // need to wait before sending send_barrier message - rpc_client->Wait(); - if (sync_mode) { - for (auto& ep : eps) { - VLOG(3) << "send barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (auto& ep : eps) { + VLOG(3) << "send barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); } + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddInput("X", "(Any) Dummy inputs, used for control dependency") + .AsDuplicable(); + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( SendBarrier operator @@ -75,7 +71,6 @@ the Parameter Server would knew all variables have been sent. 
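The scatter changes above switch the documented semantics from accumulation (Out[Ids] += Updates) to plain assignment (Out[Ids] = Updates) and make the kernel copy X instead of sharing its buffer. A minimal sketch of the new forward behavior, treating each updated row as a single scalar for brevity (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <vector>

// Out = X (a real copy, as TensorCopySync now does), then Out[Ids[i]] is
// overwritten -- not incremented -- by Updates[i].
std::vector<float> ScatterAssignSketch(const std::vector<float>& x,
                                       const std::vector<int64_t>& ids,
                                       const std::vector<float>& updates) {
  std::vector<float> out = x;
  for (size_t i = 0; i < ids.size(); ++i) out[ids[i]] = updates[i];
  return out;
}

int main() {
  const std::vector<float> x{1.f, 1.f, 1.f, 1.f};
  const std::vector<int64_t> ids{1, 3};
  const std::vector<float> updates{5.f, 7.f};
  const auto out = ScatterAssignSketch(x, ids, updates);  // {1, 5, 1, 7}
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}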
"(string vector, default 127.0.0.1:6164)" "Server endpoints to send variables to.") .SetDefault({"127.0.0.1:6164"}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 0cac329aafa8c4c67cae48ba62a48575f5edba92..48322ac7fd54a2e4cc3405a2c4dcddfc273f5a66 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include // NOLINT #include +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -42,24 +43,22 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); + rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } if (sync_send) { - rpc_client->Wait(); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; @@ -69,6 +68,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { void Make() { AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( Send operator diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h index deab005149027caffa962783df944fad7110382f..dc26c53c64f06ce21856fb5af8f2a5eb3fc75bb7 100644 --- a/paddle/fluid/operators/send_recv_util.h +++ b/paddle/fluid/operators/send_recv_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace operators { @@ -22,7 +23,10 @@ inline bool NeedSend(const framework::Scope& scope, const std::string& varname) { // dummy variable is only used in parallel executor to represent // some dependency relationship, we don't need to send/recv it. - if (varname == "dummy") return false; + // TODO(paddle-dev): Why would parallel executor logic leaked into here? + if (varname.find(framework::ir::Node::kControlDepVarName) != + std::string::npos) + return false; auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); diff --git a/paddle/fluid/operators/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_enumerate_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..58e48c228bb34814700fd0f7a3d62ef4b1a435dd --- /dev/null +++ b/paddle/fluid/operators/sequence_enumerate_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_enumerate_op.h" + +namespace paddle { +namespace operators { + +class SequenceEnumerateOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of SequecceEnumerate operator should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(X) of SequenceEnumerate operator should not be null."); + + const auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2UL, + "Input(X) of SequenceEnumerate operator's rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], 1UL, + "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1."); + + const auto win_size = ctx->Attrs().Get("win_size"); + ctx->SetOutputDim("Out", {x_dims[0], win_size}); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(2-D LoDTensor with the 2nd dimension equal to 1) " + "Input LoDTensor of SequenceEnumerate operator."); + AddOutput("Out", + "(2-D LoDTensor with the 2nd dimension equal to win_size) " + "Output LoDTensor of SequenceEnumerate operator."); + AddAttr("win_size", "(int) The enumerate sequence window size.") + .AddCustomChecker([](const int& win_size) { + PADDLE_ENFORCE(win_size >= 2, + "The window size should be not less than 2."); + }); + AddAttr("pad_value", "(int) The enumerate sequence padding value.") + .SetDefault(0); + AddComment(R"DOC( +Sequence Enumerate Operator. + +Generate a new sequence for the input index sequence, which enumerates all the +sub-sequences with length `win_size` of the input. +The enumerated sequence has the same 1st dimension with variable `input`, and +the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. + +Examples: +Case 1: + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + Attrs: + win_size = 2 + pad_value = 0 + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp, + ops::SequenceEnumerateOpMaker); +REGISTER_OP_CPU_KERNEL( + sequence_enumerate, + ops::SequenceEnumerateKernel, + ops::SequenceEnumerateKernel); diff --git a/paddle/fluid/operators/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_enumerate_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdc9a615aa9a1ecd99c1f6995361f8c5ff0aa383 --- /dev/null +++ b/paddle/fluid/operators/sequence_enumerate_op.cu @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/operators/sequence_enumerate_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void CalcOutPut(const T* in_data, const size_t* in_lod, + const size_t lod_len, const int64_t win_size, + const int64_t pad_value, T* out_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_lod[lod_len - 1]) { + int end_idx = 0; + // Get LoD interval of index + for (int i = 1; i < lod_len; ++i) { + if (index < in_lod[i]) { + end_idx = in_lod[i]; + break; + } + } + for (size_t i = 0; i < win_size; ++i) { + int word_pos = index + i; + out_data[index * win_size + i] = + word_pos < end_idx ? in_data[word_pos] : pad_value; + } + } +} + +template +class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int win_size = context.Attr("win_size"); + int pad_value = context.Attr("pad_value"); + + auto in_dims = in->dims(); + auto in_lod = in->lod(); + + PADDLE_ENFORCE_EQ( + static_cast(in_dims[0]), in_lod[0].back(), + "The actual input data's size mismatched with LoD information."); + + /* Generate enumerate sequence set */ + auto stream = context.cuda_device_context().stream(); + auto lod0 = in_lod[0]; + auto in_len = in->numel(); + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + // Copy LoD to GPU + const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace()); + // Calc output tensor + CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + sequence_enumerate, + paddle::operators::SequenceEnumerateOpCUDAKernel, + paddle::operators::SequenceEnumerateOpCUDAKernel); diff --git a/paddle/fluid/operators/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_enumerate_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc18d9b2071303377505155476b87ed029eaf986 --- /dev/null +++ b/paddle/fluid/operators/sequence_enumerate_op.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
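Both the CUDA kernel above and the CPU kernel in this header implement the windowing shown in Case 1 of the operator comment. A standalone sketch over one LoD level, using plain vectors (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <vector>

// For every position idx of every sequence, emit the win_size entries
// starting at idx, padding with pad_value once the owning sequence ends.
std::vector<std::vector<int64_t>> Enumerate(const std::vector<int64_t>& data,
                                            const std::vector<size_t>& lod0,
                                            int win_size, int64_t pad_value) {
  std::vector<std::vector<int64_t>> out(data.size(),
                                        std::vector<int64_t>(win_size));
  for (size_t i = 0; i + 1 < lod0.size(); ++i) {
    for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
      for (int w = 0; w < win_size; ++w) {
        const size_t pos = idx + w;
        out[idx][w] = pos < lod0[i + 1] ? data[pos] : pad_value;
      }
    }
  }
  return out;
}

int main() {
  // Mirrors Case 1 from the operator comment: X.lod = [[0, 3, 5]].
  const std::vector<int64_t> data{1, 2, 3, 4, 5};
  const std::vector<size_t> lod0{0, 3, 5};
  const auto out = Enumerate(data, lod0, /*win_size=*/2, /*pad_value=*/0);
  for (const auto& row : out)  // [1,2] [2,3] [3,0] [4,5] [5,0]
    std::printf("[%lld, %lld]\n", static_cast<long long>(row[0]),
                static_cast<long long>(row[1]));
  return 0;
}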
+ +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using LoDTensor = framework::LoDTensor; + +template +class SequenceEnumerateKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int win_size = context.Attr("win_size"); + int pad_value = context.Attr("pad_value"); + + auto in_dims = in->dims(); + auto in_lod = in->lod(); + + PADDLE_ENFORCE_EQ( + static_cast(in_dims[0]), in_lod[0].back(), + "The actual input data's size mismatched with LoD information."); + + // Generate enumerate sequence set + auto lod0 = in_lod[0]; + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) { + for (int word_idx = 0; word_idx < win_size; ++word_idx) { + size_t word_pos = idx + word_idx; + out_data[win_size * idx + word_idx] = + word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 39301e1ac0971dfe0ca7854257f10ddeb60f1000..9228c81310463c3cb1d32fb613dd51d175b99c0e 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -53,25 +53,27 @@ struct SequenceExpandFunctor { const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* out) { int out_offset = 0; - auto& eigen_place = *context.eigen_device(); + int x_item_length = x.numel() / x.dims()[0]; + auto out_data = out->data(); + auto x_data = x.data(); for (size_t i = 1; i < ref_lod.size(); ++i) { int repeat_num = ref_lod[i] - ref_lod[i - 1]; int x_start = x_lod[i - 1]; int x_end = x_lod[i]; int x_seq_len = x_end - x_start; if (repeat_num > 0) { - auto x_sub_tensor = x.Slice(x_start, x_end); - x_sub_tensor.Resize({1, x_sub_tensor.numel()}); int out_start = out_offset; if (out->lod().size() == 1) { out_start = out->lod()[0][out_offset]; } - auto out_sub_tensor = - out->Slice(out_start, out_start + x_seq_len * repeat_num); - out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]}); - EigenMatrix::From(out_sub_tensor).device(eigen_place) = - EigenMatrix::From(x_sub_tensor) - .broadcast(Eigen::array({{repeat_num, 1}})); + for (int j = 0; j < repeat_num; j++) { + for (int k = 0; k < x_seq_len; k++) { + for (int l = 0; l < x_item_length; l++) { + out_data[(out_start + j * x_seq_len + k) * x_item_length + l] = + x_data[(x_start + k) * x_item_length + l]; + } + } + } } out_offset += repeat_num; } diff --git a/paddle/fluid/operators/sequence_mask_op.cc b/paddle/fluid/operators/sequence_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e45c18d6aff65ecac565ef05e36b2d47ad8744b8 --- /dev/null +++ b/paddle/fluid/operators/sequence_mask_op.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_mask_op.h" + +REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp, + paddle::operators::SequenceMaskOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + sequence_mask, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_mask_op.cu b/paddle/fluid/operators/sequence_mask_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ff5acf4d9edd5f0f15cbcb22eae212c2d49ccaab --- /dev/null +++ b/paddle/fluid/operators/sequence_mask_op.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_mask_op.h" + +REGISTER_OP_CUDA_KERNEL( + sequence_mask, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..18acb735cecabd1e01f7821c880fd8ed5e52971f --- /dev/null +++ b/paddle/fluid/operators/sequence_mask_op.h @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
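The header below spells out the mask semantics, Y(..., j) = (j < X(...)), with maxlen falling back to max(X) when the attribute is negative. A flat CPU sketch of the same computation on a vector of lengths (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdio>
#include <vector>

// mask[i][j] = (j < lengths[i]); maxlen defaults to max(lengths) when < 0.
std::vector<std::vector<int>> SequenceMaskSketch(const std::vector<int>& lengths,
                                                 int maxlen) {
  if (maxlen < 0) maxlen = *std::max_element(lengths.begin(), lengths.end());
  std::vector<std::vector<int>> mask(lengths.size(),
                                     std::vector<int>(maxlen, 0));
  for (size_t i = 0; i < lengths.size(); ++i)
    for (int j = 0; j < maxlen; ++j) mask[i][j] = j < lengths[i] ? 1 : 0;
  return mask;
}

int main() {
  const auto mask = SequenceMaskSketch({2, 3, 1}, /*maxlen=*/-1);  // width 3
  for (const auto& row : mask)
    std::printf("%d %d %d\n", row[0], row[1], row[2]);
  return 0;
}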
+ +#pragma once + +#ifdef __NVCC__ +#include +#include +#include +#else +#include +#endif + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +class SequenceMaskOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); + + auto maxlen = ctx->Attrs().Get("maxlen"); + if (maxlen > 0) { // We can only infershape when maxlen > 0 + auto dim = framework::vectorize2int(ctx->GetInputDim("X")); + dim.push_back(maxlen); + ctx->SetOutputDim("Y", framework::make_ddim(dim)); + } + } +}; + +class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor of sequence_mask op."); + AddOutput("Y", "The output mask of sequence_mask op."); + AddAttr("maxlen", + "The maximum length of the sequence. If maxlen < 0, maxlen " + "= max(Input(X)).") + .SetDefault(-1) + .AddCustomChecker([](int &v) { + PADDLE_ENFORCE(v < 0 || v >= 1, + "Attr(maxlen) must be less than 0 or larger than 1"); + }); + AddAttr("out_dtype", "Output data type"); + AddComment(R"DOC( +SequenceMask Operator + +This operator outputs a Mask according to Input(X) and Attr(maxlen). +Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the +Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + +Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) + +If maxlen < 0, maxlen = max(X) + )DOC"); + } +}; + +template +struct SequenceMaskForRangeFunctor { + HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen) + : x_(x), y_(y), maxlen_(maxlen) {} + + HOSTDEVICE void operator()(int y_idx) const { + int x_idx = y_idx / maxlen_; + int j = y_idx % maxlen_; + y_[y_idx] = static_cast(j < x_[x_idx] ? 
1 : 0); + } + + private: + const Tx *x_; + Ty *y_; + int maxlen_; +}; + +template +struct SequenceMaskFunctor { + using Tensor = framework::LoDTensor; + + SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y, + int limits, int maxlen) + : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {} + + template + void apply() const { + auto *y_data = y_->mutable_data(ctx_.GetPlace()); + platform::ForRange for_range(ctx_, limits_); + for_range(SequenceMaskForRangeFunctor(x_, y_data, maxlen_)); + } + + private: + const DeviceContext &ctx_; + const Tx *x_; + Tensor *y_; + int limits_; + int maxlen_; +}; + +template +class SequenceMaskKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Output("Y"); + auto maxlen = ctx.Attr("maxlen"); + + auto *x_data = x->data(); + auto x_numel = x->numel(); + if (maxlen < 0) { +#ifdef __NVCC__ + VLOG(10) + << "SequenceMaskOp on GPU may be slow when maxlen is not provided."; + maxlen = static_cast( + thrust::reduce(thrust::device_pointer_cast(x_data), + thrust::device_pointer_cast(x_data) + x_numel, + static_cast(0), thrust::maximum())); +#else + maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); +#endif + auto y_dim = framework::vectorize2int(x->dims()); + y_dim.push_back(maxlen); + y->Resize(framework::make_ddim(y_dim)); + } + + auto out_dtype = static_cast( + ctx.Attr("out_dtype")); + auto &dev_ctx = ctx.template device_context(); + framework::VisitDataType(out_dtype, + SequenceMaskFunctor( + dev_ctx, x_data, y, x_numel * maxlen, maxlen)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_pad_op.cc b/paddle/fluid/operators/sequence_pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..44d73aa4076abfe15c906478702ac7c4a55303d4 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_pad_op.h" + +namespace paddle { +namespace operators { + +class SequencePadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PadValue"), + "Input(PadValue) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(x) can't be less than 2."); + auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size()); + auto pad_value_dims = ctx->GetInputDim("PadValue"); + PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) || + pad_value_dims == time_step_dims, + "The Input(PadValue) must be a scalar or a tensor whose " + "shape equals to time steps in sequences"); + + int out_dim_0 = -1; + int out_dim_1 = -1; + + if (ctx->IsRuntime()) { + // run time + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + int seq_num = x_lod_0.size() - 1; + int max_seq_len = math::MaximumSequenceLength(x_lod_0); + int padded_length = ctx->Attrs().Get("padded_length"); + if (padded_length == -1) { + padded_length = max_seq_len; + } + PADDLE_ENFORCE_GE(padded_length, max_seq_len, + "The Attr(padded_length) must be -1 or an int greater " + "than the length of the longest original sequence."); + out_dim_0 = seq_num; + out_dim_1 = padded_length; + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + } + + std::vector out_dims_vec{out_dim_0, out_dim_1}; + auto time_step_dims_vec = framework::vectorize2int(time_step_dims); + out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(), + time_step_dims_vec.end()); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } +}; + +class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("PadValue", + "(LoDTensor), this Tensor holds values that will be fill into " + "padded steps. It can be a scalar or a tensor whose shape equals " + "to time steps in sequences. If it's a scalar, it will be " + "automatically broadcasted to the shape of time step."); + AddOutput( + "Out", + "(LoDTensor) The output vairable, which contains padded sequences."); + AddAttr( + "padded_length", + "The length of padded sequences. It can be setted to -1 or " + "any positive int. When it is -1, all sequences will be padded up to " + "the length of the longest one among them; when it a certain positive " + "value, it must be greater than the length of the longest original " + "sequence.") + .SetDefault(-1); + AddComment(R"DOC( + Sequence Pad Operator + + This operator pads sequences in a same batch to a consistent length. 
+ The length is specified by attribute 'padded_length'. New elements, + whose values are specified by input 'PadValue', will be appended to + the end of each sequence, to make their final lengths consistent. + + Following are cases to better explain how this works: + + Case 1: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [a, b, c, d, e] + and Input(PadValue): + PadValue.data = [0] + and attribite 'padded_length' = 4, + then we get LoDTensor: + Out.data = [[a, b, 0, 0], + [c, d, e, 0]] + + Case 2: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] + and Input(PadValue): + PadValue.data = [0] + and attribite 'padded_length' = -1, which mean using the length + of longest input sequence(3 in this case), + then we get LoDTensor: + Out.data = [[[a1, a2], [b1, b2], [0, 0]], + [[c1, c2], [d1, d2], [e1, e2]]] + + Case 3: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] + and Input(PadValue): + PadValue.data = [p1, p2] + and attribite 'padded_length' = -1, which mean using the length + of longest input sequence(3 in this case), + then we get LoDTensor: + Out.data = [[[a1, a2], [b1, b2], [p1, p2]], + [[c1, c2], [d1, d2], [e1, e2]]] + + )DOC"); + } +}; + +class SequencePadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_pad_op.cu b/paddle/fluid/operators/sequence_pad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ff8f81a2f0ec4a72befc3be2a5fc48c3a586c824 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_pad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_pad_op.h b/paddle/fluid/operators/sequence_pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5fc9da69d787ff3aeffa716689d44772ad8f7bd2 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequencePadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto* pad_value = ctx.Input("PadValue"); + + int padded_length = ctx.Attr("padded_length"); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *x, out, *pad_value, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequencePadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = ctx.Attr("padded_length"); + + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc index 0ddacb57106c090e8f4f9350a65a30ca102f8e0a..7aca9f7111956dba63e2ceee10077879fe092bdf 100644 --- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc @@ -68,7 +68,9 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* x = ctx.Input("X"); auto* x_grad = ctx.Output(framework::GradVarName("X")); - + if (x_grad) { + x_grad->set_lod(x->lod()); + } auto lod = x->lod(); const size_t level = lod.size() - 1; diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h index 
cb93a02b8386ed50ff176fc25b88449b7eb16902..bca564e16f9951519eefe25126aadebb4c1326b6 100644 --- a/paddle/fluid/operators/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_softmax_op.h @@ -66,6 +66,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* x = ctx.Input("X"); auto* x_grad = ctx.Output(framework::GradVarName("X")); + if (x_grad) { + x_grad->set_lod(x->lod()); + } auto lod = x->lod(); const size_t level = lod.size() - 1; diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 2685ce217ee0f0d3e89f3751e96218dcd19bead4..d8b0165b2a89b04bd55671a37d96ee4ba275b2eb 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel { for (size_t i = 0; i < grad.rows().size(); i++) { PADDLE_ENFORCE(grad.rows()[i] < grad.height(), "Input rows index should less than height"); - int64_t id_index = param.Index(grad.rows()[i]); + int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); PADDLE_ENFORCE_GE(id_index, static_cast(0), "id should be in the table"); for (int64_t j = 0; j < grad_row_width; j++) { diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index b44d5f898013a5d27467bd80118c29a886d5e8b3..1be9fe47af71d31ce2e0eba807ea4a43601f8aca 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -38,7 +38,7 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Input", "(Tensor), The input tensor."); AddOutput("Out", "(Tensor), The shape of input tensor, the data type of the shape" - " is int64_t, will be on the same device with the input Tensor."); + " is int32_t, will be on the same device with the input Tensor."); AddComment(R"DOC( Shape Operator @@ -53,5 +53,5 @@ Get the shape of input tensor. Only support CPU input Tensor now. namespace ops = paddle::operators; REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, +REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu index 7736a2a1e13cfa5d445411b3efac7669a7bf23a2..d8fa9515abf807ab4ae3c47e8e1b1cf7f30440a8 100644 --- a/paddle/fluid/operators/shape_op.cu +++ b/paddle/fluid/operators/shape_op.cu @@ -15,6 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/shape_op.h" REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, paddle::operators::ShapeKernel); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h index 3be86b66a538e7b38a5d59095fee7e7636364bce..0d510a505583c55e26a26bfc6e5d6192899b3d9e 100644 --- a/paddle/fluid/operators/shape_op.h +++ b/paddle/fluid/operators/shape_op.h @@ -27,7 +27,7 @@ class ShapeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_t = ctx.Input("Input"); auto* out_t = ctx.Output("Out"); - auto out_data = out_t->mutable_data(platform::CPUPlace()); + auto out_data = out_t->mutable_data(platform::CPUPlace()); auto in_dims = in_t->dims(); for (int i = 0; i < in_dims.size(); ++i) { out_data[i] = in_dims[i]; diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 8146c5f56104b7dec86b1c4491ed10fc2e94b58b..29d2fb989754f5621222768a279a1c898ea1c355 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -62,7 +62,10 @@ class ShrinkRNNMemoryOp : public ArrayOp { } if (dst_num_rows != 0) { - out_tensor.ShareDataWith(x_tensor.Slice(0, height)); + out_tensor.mutable_data(place, x_tensor.type()); + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, + &out_tensor); } } }; diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 5596fa0648ccc151bc0d11de9c556599428a8d71..2bdb23e999621b10799b5163f326bc4b66a437e6 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_x; + framework::LoDTensor flattened_out; + flattened_x.ShareDataWith(*X).Resize(flattened_dims); + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + math::SoftmaxCUDNNFunctor()( - context.template device_context(), X, Out); + context.template device_context(), + &flattened_x, &flattened_out); } }; @@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { // allocate memory on device. 
dX->mutable_data(context.GetPlace()); + auto dims = Out->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_out; + framework::LoDTensor flattened_d_out; + framework::LoDTensor flattened_d_x; + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); + flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + math::SoftmaxGradCUDNNFunctor()( - context.template device_context(), Out, - dOut, dX); + context.template device_context(), + &flattened_out, &flattened_d_out, &flattened_d_x); } }; diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 6668e6b9e917eea7ba4a80ac78917b73eb827208..01819f53e3ab0973f6140c5a81f18f954b6a0376 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc; using mkldnn::memory; // Note: paddle has also "memory" namespace using mkldnn::primitive; -using mkldnn::softmax_forward; -using mkldnn::softmax_backward; using mkldnn::prop_kind; +using mkldnn::softmax_backward; +using mkldnn::softmax_forward; using mkldnn::stream; using platform::to_void_cast; @@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - PADDLE_ENFORCE(input->dims().size() == 2UL, - "The input of softmax op must be a 2D matrix."); - const T* input_data = input->data(); - // allocate memory for output - T* output_data = output->mutable_data(ctx.GetPlace()); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // MKL-DNN does support softmax over selected axis. Having 2D Tensor, - // we will make normalization after final eg. axis: 1 - PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), - "Softmax input and output dimensions should match"); + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + "The shape of softmax's input and output must be identical."); + + // make sure 'output' holds memory, which will be shared by + // 'flattened_output' later. 
+ output->mutable_data(ctx.GetPlace()); + + // flatten input and output to 2-D matrixs + auto dims = input->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_input; + framework::Tensor flattened_output; + flattened_input.ShareDataWith(*input).Resize(flattened_dims); + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + + const T* input_data = flattened_input.data(); + T* output_data = flattened_output.mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); + std::vector dst_tz = src_tz; // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Generate keys for storing/retriving primitives for this operator @@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); - const T* dst_data = output->data(); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - const auto* diff_dst_ptr = dout->template data(); - auto* dx = ctx.template Output(framework::GradVarName("X")); - T* diff_src_ptr = dx->template mutable_data(ctx.GetPlace()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + PADDLE_ENFORCE_EQ( + dout->dims(), dx->dims(), + "The shape of softmax_grad's input and output must be identical."); + + // make sure 'dx' holds memory, which will be shared by 'flattened_dx' + // later. + dx->template mutable_data(ctx.GetPlace()); + + auto dims = dout->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_output; + framework::Tensor flattened_dout; + framework::Tensor flattened_dx; + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); + flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); + + const T* dst_data = flattened_output.data(); + const T* diff_dst_ptr = flattened_dout.template data(); + T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(flattened_dims); std::vector src_tz(dst_tz); - PADDLE_ENFORCE(output->dims().size() == 2UL, - "The input of softmax op must be a 2D matrix."); - // MKL-DNN does support softmax over selected axis. Having 2D Tensor, - // we will make normalization after final eg. 
axis: 1
- PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
- "Softmax input and output dimensions should match");
+
// Same memory descriptor to be used for input and output
memory::dims softmax_tz = {src_tz[0], src_tz[1]};
// Currently only supports NC data format
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 31a7458f637921c290fc71ac748143867b4aae19..bb081238820b9ee3ae095442d21cfce11f7b41e5 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SoftmaxOp should not be null.");
- auto x_dims = ctx->GetInputDim("X");
- PADDLE_ENFORCE(x_dims.size() == 2UL,
- "The input of softmax op must be a matrix.");
- ctx->SetOutputDim("Out", x_dims);
+ ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
@@ -81,8 +78,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
- "The input tensor of softmax. "
- "2-D with shape [batch_size, input_feature_dimensions].");
+ "The input tensor of softmax, "
+ "whose last dimension is the input_feature_dimensions.");
AddOutput("Out", "The normalized values with the same shape as X.")
.Reuse("X");
AddAttr(
@@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Softmax Operator.
-The input of the softmax operator is a 2-D tensor with shape N x K (N is the
-batch_size, K is the dimension of input feature). The output tensor has the
-same shape as the input tensor.
+The input of the softmax operator is a tensor of any rank. The output tensor
+has the same shape as the input.
-For each row of the input tensor, the softmax operator squashes the
-K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-values in the range [0, 1] that add up to 1.
+The input tensor will first be logically flattened to a 2-D matrix. The matrix's
+second dimension (row length) is the same as the last dimension of the input
+tensor, and the first dimension (column length) is the product of all other
+dimensions of the input tensor. For each row of the matrix, the softmax operator
+squashes the K-dimensional (K is the width of the matrix, which is also the size
+of the input tensor's last dimension) vector of arbitrary real values to a
+K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input. Then the
ratio of the exponential of the given dimension and the sum of exponential
values of all the other dimensions is the output of the softmax operator.
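(Editorial note, not part of the patch: the rewritten DOC above, together with the per-row formula that follows, says the input is logically flattened to a 2-D matrix whose row width is the last dimension and whose row count is the product of the remaining dimensions, after which softmax is applied row by row. A minimal standalone C++ sketch of that behaviour is given below; the function name SoftmaxFlattened and the plain std::vector buffers are illustrative assumptions rather than Paddle APIs, and the max-subtraction is the usual trick for numerical stability.)

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Row-wise softmax over a buffer viewed as a (rows x cols) matrix: cols is the
// size of the original tensor's last dimension, rows is the product of all the
// other dimensions (the "flatten to 2-D" view described above).
void SoftmaxFlattened(const std::vector<float>& x, std::vector<float>* out,
                      std::size_t rows, std::size_t cols) {
  out->assign(rows * cols, 0.f);
  for (std::size_t i = 0; i < rows; ++i) {
    const float* row = x.data() + i * cols;
    float* o = out->data() + i * cols;
    // Subtract the row maximum before exponentiating to avoid overflow.
    const float max_v = *std::max_element(row, row + cols);
    float sum = 0.f;
    for (std::size_t j = 0; j < cols; ++j) {
      o[j] = std::exp(row[j] - max_v);
      sum += o[j];
    }
    for (std::size_t j = 0; j < cols; ++j) o[j] /= sum;
  }
}
// Example: a tensor of shape [2, 3, 4] is processed with rows = 2 * 3 = 6 and
// cols = 4, the same flattening the kernels in this patch perform.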
-For each row $i$ and each column $j$ in Input(X), we have: +For each row $i$ and each column $j$ in the matrix, we have: $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$ )DOC"); @@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx->GetInputDim(framework::GradVarName("Out")), "Input(Out) and its gradients should have a same shape."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: @@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { layout_ = framework::DataLayout::kMKLDNN; } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); @@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { } }; +class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("softmax_grad"); + + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SoftmaxOpGradMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 600da45a0bbb69b76d59c981e195fc03a49b0504..cf1eeb017d666f605a431aa54637d8cbc99c7c46 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -31,8 +31,12 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + int rank = X->dims().size(); + Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); + Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + math::SoftmaxFunctor()( - context.template device_context(), X, Out); + context.template device_context(), &X_2d, &Out_2d); } }; @@ -47,8 +51,14 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. 
dX->mutable_data(context.GetPlace()); + int rank = Out->dims().size(); + Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); + Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); + math::SoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX); + context.template device_context(), &Out_2d, &dOut_2d, + &dX_2d); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 53cb716a979229c99fcbdc12f1aeab4e21b320f3..1a9324ec862fc3dd7ce669c5fed94527cac22b8f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker "(bool, default: false), A flag to indicate whether to interpretate " "the given labels as soft labels.") .SetDefault(false); + AddAttr( + "ignore_index", + "(int, default -100), Specifies a target value that is ignored and" + "does not contribute to the input gradient. Only valid if soft_label" + "is set to False") + .SetDefault(-100); AddComment(R"DOC( Softmax With Cross Entropy Operator. diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8f7840cee1dd95a828fd4ac8815e335a5db47e3d..a07c17348ebb3f768d1c8be65c2d31e3c130bd23 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ limitations under the License. */ #define EIGEN_USE_GPU +#include +#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" namespace paddle { @@ -24,11 +26,13 @@ using Tensor = framework::Tensor; namespace { template __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, - const int batch_size, const int class_num) { + const int batch_size, const int class_num, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; i += blockDim.x * gridDim.x) { int idx = i * class_num + labels[i]; - logit_grad[idx] -= static_cast(1.); + logit_grad[idx] -= + ignore_index == labels[i] ? static_cast(0.) 
: static_cast(1.);
}
}
@@ -53,8 +57,196 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
}
}
+
} // namespace
+static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
+static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
+static __device__ __forceinline__ float real_log(float x) {
+ return math::TolerableValue()(logf(x));
+}
+static __device__ __forceinline__ double real_log(double x) {
+ return math::TolerableValue()(log(x));
+}
+
+/** In the following code, 3 CUDA kernels are implemented to calculate softmax
+ * and loss **/
+/*
+ Supposing that x is `logits` and y is `labels`, the equations are as
+follows:
+
+ cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
+ = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
+ = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
+ = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
+ = \sum_{j}(-y_i_j * tmp_i_j)
+
+ softmax_i_j = e^{tmp_i_j}
+
+where:
+ max_i = \max_{j}{x_i_j}
+ logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
+ tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
+
+Therefore, the calculation can be separated into 3 steps:
+Step 1: row-wise operation to calculate max_i
+Step 2: row-wise operation to calculate logDiffMaxSum_i
+Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
+
+To save memory, we can share memory among max_i, logDiffMaxSum_i and
+cross\_entropy_i.
+In this way, the 3 steps should be changed to:
+Step 1 (RowReductionForMax): row-wise operation to calculate max_i
+Step 2 (RowReductionForDiffMaxSum): calculate the intermediate result softmax'_i_j =
+x_i_j - max_i, and row-wise operation to calculate logDiffMaxSum_i
+Step 3 (RowReductionForSoftmaxAndCrossEntropy): calculate tmp_i_j = softmax'_i_j
+- logDiffMaxSum_i, and finally get softmax_i_j and cross\_entropy_i
+*/
+
+// There are 3 kinds of reduce algorithms in cub:
+// BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
+// BLOCK_REDUCE_RAKING
+// BLOCK_REDUCE_WARP_REDUCTIONS (default)
+template
+using BlockReduce =
+ cub::BlockReduce;
+
+template
+using BlockReduceTempStorage = typename BlockReduce::TempStorage;
+
+// Make sure that BlockDim <= feature_size
+// This kernel is used to calculate the max element of each row
+template
+__global__ void RowReductionForMax(const T* logits_data, T* max_data,
+ int feature_size) {
+ __shared__ BlockReduceTempStorage temp_storage;
+
+ auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
+ auto end_idx = feature_size * (blockIdx.x + 1);
+
+ T cur_max = logits_data[beg_idx];
+ beg_idx += BlockDim;
+ while (beg_idx < end_idx) {
+ if (cur_max < logits_data[beg_idx]) {
+ cur_max = logits_data[beg_idx];
+ }
+ beg_idx += BlockDim;
+ }
+
+ cur_max = BlockReduce(temp_storage).Reduce(cur_max, cub::Max());
+
+ if (threadIdx.x == 0) {
+ max_data[blockIdx.x] = cur_max < -64 ?
-64 : cur_max; + } +} + +// Make sure that BlockDim <= feature_size +template +__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data, + T* softmax, int feature_size) { + __shared__ BlockReduceTempStorage temp_storage; + + auto beg_idx = feature_size * blockIdx.x + threadIdx.x; + auto end_idx = feature_size * (blockIdx.x + 1); + + auto block_max = max_data[blockIdx.x]; + + softmax[beg_idx] = logits_data[beg_idx] - block_max; + T diff_max_sum = real_exp(softmax[beg_idx]); + beg_idx += BlockDim; + while (beg_idx < end_idx) { + softmax[beg_idx] = logits_data[beg_idx] - block_max; + diff_max_sum += real_exp(softmax[beg_idx]); + beg_idx += BlockDim; + } + + diff_max_sum = + BlockReduce(temp_storage).Reduce(diff_max_sum, cub::Sum()); + if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum); +} + +// Make sure that BlockDim <= feature_size +template +__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data, + const T* labels_data, + T* loss_data, T* softmax, + int feature_size) { + __shared__ BlockReduceTempStorage temp_storage; + + auto beg_idx = feature_size * blockIdx.x + threadIdx.x; + auto end_idx = feature_size * (blockIdx.x + 1); + + // log_diff_max_sum shares memory with loss + auto block_log_diff_max_sum = loss_data[blockIdx.x]; + auto tmp = softmax[beg_idx] - block_log_diff_max_sum; + softmax[beg_idx] = real_exp(tmp); + auto loss = -labels_data[beg_idx] * tmp; + beg_idx += BlockDim; + while (beg_idx < end_idx) { + tmp = softmax[beg_idx] - block_log_diff_max_sum; + softmax[beg_idx] = real_exp(tmp); + loss -= (labels_data[beg_idx] * tmp); + beg_idx += BlockDim; + } + + loss = BlockReduce(temp_storage).Reduce(loss, cub::Sum()); + if (threadIdx.x == 0) loss_data[blockIdx.x] = loss; +} + +template +__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) { + auto idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < batch_size) out[idx] = static_cast(1); +} + +template +static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data, + const T* labels_data, + T* softmax_data, T* loss_data, + int batch_size, int feature_size, + cudaStream_t stream) { + constexpr int kMaxBlockDim = 512; + int block_dim = feature_size >= kMaxBlockDim + ? 
kMaxBlockDim + : (1 << static_cast(std::log2(feature_size))); + +#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: \ + RowReductionForMax<<>>( \ + logits_data, loss_data, feature_size); \ + RowReductionForDiffMaxSum<<>>( \ + logits_data, loss_data, softmax_data, feature_size); \ + RowReductionForSoftmaxAndCrossEntropy< \ + T, BlockDim><<>>( \ + logits_data, labels_data, loss_data, softmax_data, feature_size); \ + break + + switch (block_dim) { + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); + case 1: + SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) / + kMaxBlockDim, + kMaxBlockDim, 0, stream>>>( + softmax_data, batch_size); + cudaMemsetAsync(loss_data, 0, batch_size, stream); + break; + default: + PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + break; + } + +#undef CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL +} + template class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { public: @@ -66,14 +258,26 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* softmax = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); - softmax->mutable_data(context.GetPlace()); - loss->mutable_data(context.GetPlace()); - - math::SoftmaxFunctor()( - context.cuda_device_context(), logits, softmax); - math::CrossEntropyFunctor()( - context.cuda_device_context(), loss, softmax, labels, - context.Attr("soft_label")); + auto* softmax_data = softmax->mutable_data(context.GetPlace()); + auto* loss_data = loss->mutable_data(context.GetPlace()); + + auto soft_label = context.Attr("soft_label"); + auto ignore_index = context.Attr("ignore_index"); + if (soft_label) { + int batch_size = logits->dims()[0]; + int feature_size = logits->dims()[1]; + auto* logits_data = logits->data(); + auto* labels_data = labels->data(); + SoftmaxWithCrossEntropyFusedKernel( + logits_data, labels_data, softmax_data, loss_data, batch_size, + feature_size, context.cuda_device_context().stream()); + } else { + math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), logits, + softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, false, + ignore_index); + } } }; @@ -95,7 +299,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; int block = 512; auto stream = context.cuda_device_context().stream(); - + auto ignore_index = context.Attr("ignore_index"); if (context.Attr("soft_label")) { int grid = (batch_size * class_num + block - 1) / block; const T* label_data = labels->data(); @@ -105,7 +309,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int grid = (batch_size + block - 1) / block; const int64_t* label_data = labels->data(); CrossEntropyGrad<<>>( - logit_grad_data, label_data, batch_size, class_num); + logit_grad_data, label_data, batch_size, class_num, ignore_index); int num = batch_size * class_num; grid = (num + block - 1) / block; Scale<<>>(logit_grad_data, loss_grad_data, num, diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h 
b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index dd6f6aca5ada7aa215d3b3444194fc53efeb7020..e9aba3b37b8cc01d4fe5de5200579d4e93f67e56 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { math::SoftmaxFunctor()(dev_ctx, logits, softmax); math::CrossEntropyFunctor()( - dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + dev_ctx, loss, softmax, labels, context.Attr("soft_label"), + context.Attr("ignore_index")); } }; diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d263426e073d95ad6d584c7370baf596587a993d..c4af5a65fc5f81c1af7c1fdcca637ca37c940637 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel { const auto &ids_rows = ids_selected_rows->rows(); auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); + for (auto &out : outs) { + out->mutable_rows()->clear(); + } // get rows for outputs - for (auto &id : ids_rows) { - size_t shard_id = static_cast(id) % shard_num; - outs[shard_id]->mutable_rows()->push_back(id); + std::unordered_map id_to_index; + for (size_t i = 0; i < ids_rows.size(); ++i) { + id_to_index[ids_rows[i]] = i; + size_t shard_id = static_cast(ids_rows[i]) % shard_num; + outs[shard_id]->mutable_rows()->push_back(ids_rows[i]); } int64_t row_width = ids_dims[1]; @@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel { {static_cast(out->rows().size()), row_width}); T *output = out->mutable_value()->mutable_data(ddim, place); for (int64_t i = 0; i < ddim[0]; ++i) { - memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + memcpy(output + i * row_width, + ids + id_to_index[out->rows()[i]] * row_width, row_width * sizeof(T)); } } diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e389c6a65e1e8220685294931c4d08e6fd928b7f --- /dev/null +++ b/paddle/fluid/operators/squeeze_op.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class SqueezeOpInferShape : public framework::InferShapeBase {
+ public:
+ void operator()(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of Squeeze operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of Squeeze operator should not be null.");
+
+ const auto &x_dims = ctx->GetInputDim("X");
+ // Check input tensor dims (<6) Eigen limit.
+ PADDLE_ENFORCE(x_dims.size() <= 6,
+ "Invalid dimensions, the rank of Input(X) "
+ "should be in the range of [1, 6] (Eigen limit).");
+
+ const auto &axes = ctx->Attrs().Get>("axes");
+ for (int a : axes) {
+ PADDLE_ENFORCE_LT(a, x_dims.size(),
+ "The squeeze axis should be less than input "
+ "tensor's rank.");
+ }
+
+ auto out_dims = GetOutputShape(axes, x_dims);
+ ctx->SetOutputDim("Out", out_dims);
+ if (x_dims[0] == out_dims[0]) {
+ // Only pass LoD when the first dimension of output and Input(X)
+ // are the same.
+ ctx->ShareLoD("X", "Out");
+ }
+ }
+
+ static framework::DDim GetOutputShape(const std::vector squeeze_dims,
+ const framework::DDim &in_dims) {
+ size_t num_squeeze_dims = squeeze_dims.size();
+ int cnt_squeezed_dims = 0;
+ bool should_squeeze[9] = {false};
+
+ // Determines number of dimensions of output tensor after squeeze.
+ // Mark and count the dimensions that need to be squeezed
+ if (num_squeeze_dims == 0) {
+ for (int idx = 0; idx < in_dims.size(); ++idx) {
+ if (in_dims[idx] == 1) {
+ should_squeeze[idx] = true;
+ ++cnt_squeezed_dims;
+ }
+ }
+ } else {
+ for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
+ int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
+ : squeeze_dims[idx];
+ // Check current index, the upper limit has been checked in line 36.
+ PADDLE_ENFORCE(current >= 0,
+ "Invalid axis, the negative axis is out of range.");
+ PADDLE_ENFORCE(in_dims[current] == 1,
+ "Invalid axis index, the axis that will be squeezed "
+ "should be equal to 1.");
+
+ if (!(should_squeeze[current])) {
+ ++cnt_squeezed_dims;
+ }
+ should_squeeze[current] = true;
+ }
+ }
+
+ // Make output dimensions
+ std::vector output_shape(in_dims.size() - cnt_squeezed_dims, 0);
+ for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
+ if (!should_squeeze[in_idx]) {
+ output_shape[out_idx++] = in_dims[in_idx];
+ }
+ }
+
+ return framework::make_ddim(output_shape);
+ }
+};
+
+class SqueezeOp : public framework::OperatorBase {
+ public:
+ using OperatorBase::OperatorBase;
+
+ private:
+ void RunImpl(const framework::Scope &scope,
+ const platform::Place &place) const override {
+ auto &axes = Attr>("axes");
+ auto x_dims = scope.FindVar(Input("X"))->Get().dims();
+ auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);
+
+ framework::AttributeMap attrs;
+ attrs["shape"] = framework::vectorize2int(out_dims);
+ // Invoke Reshape Op
+ auto reshape_op = framework::OpRegistry::CreateOp(
+ "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+ {{"Out", {Output("Out")}}}, attrs);
+ reshape_op->Run(scope, place);
+ }
+};
+
+class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ void Make() override {
+ AddInput("X", "(Tensor). The input tensor of squeeze operator.");
+ AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
+ AddAttr>("axes",
+ "(std::vector).
List of integers," + " indicating the dimensions to squeeze.") + .SetDefault({}); + AddComment(R"DOC( + Squeeze Operator. + + Remove single-dimensional entries from the shape of a tensor. + Takes a parameter axes with a list of axes to squeeze. + If axes is not provided, all the single dimensions will be removed from the shape. + If an axis is selected with shape entry not equal to one, an error is raised. + + Examples: + Case 1: + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + + Case 2: + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) + )DOC"); + } +}; + +class SqueezeGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class SqueezeGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, + attrs); + reshape_op->Run(scope, place); + } +}; + +// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, +// the XShape is used to carry the shape and lod of X which will be used in +// squeeze_grad, in this way, the framework can reuse the memory of X +// immediately the squeeze2_op is finished. 
+// Considering compatibility issues, we could not fix squeeze2_op +class Squeeze2OpMaker : public SqueezeOpMaker { + public: + void Make() override { + SqueezeOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in SqueezeGradOp.") + .AsIntermediate(); + } +}; + +class Squeeze2OpInferShape : public SqueezeOpInferShape { + public: + void operator()(framework::InferShapeContext *ctx) const override { + SqueezeOpInferShape::operator()(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) of Squeeze operator should not be null."); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } +}; + +class Squeeze2Op : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("squeeze2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Squeeze2GradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("XShape"), + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } +}; + +class Squeeze2GradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto xshape_name = Input("XShape"); + auto xshape_dims = + scope.FindVar(xshape_name)->Get().dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, + {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); + reshape_op->Run(scope, place); + } +}; + 
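(Editorial note, not part of the patch: every squeeze variant above first computes the squeezed shape and then delegates the actual data movement to reshape/reshape2. A small standalone sketch of that shape computation follows; SqueezeShape and the std::vector-based dims are illustrative assumptions rather than the registered operator, but the behaviour mirrors GetOutputShape and the Case 1/Case 2 examples in the DOC above.)

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Drop size-1 dimensions from `dims`. With empty `axes`, every dimension of
// extent 1 is removed; otherwise only the listed axes are removed, each of
// which must have extent 1 (negative axes count from the end).
std::vector<int64_t> SqueezeShape(const std::vector<int64_t>& dims,
                                  std::vector<int> axes) {
  std::vector<bool> drop(dims.size(), false);
  if (axes.empty()) {
    for (std::size_t i = 0; i < dims.size(); ++i) drop[i] = (dims[i] == 1);
  } else {
    for (int a : axes) {
      if (a < 0) a += static_cast<int>(dims.size());
      if (a < 0 || a >= static_cast<int>(dims.size()) || dims[a] != 1) {
        throw std::invalid_argument("axis out of range or extent != 1");
      }
      drop[a] = true;
    }
  }
  std::vector<int64_t> out;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (!drop[i]) out.push_back(dims[i]);
  }
  return out;
}
// SqueezeShape({1, 3, 1, 5}, {0}) -> {3, 1, 5}   (Case 1 in the DOC above)
// SqueezeShape({1, 3, 1, 5}, {})  -> {3, 5}      (Case 2 in the DOC above)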
+} // namespace operators +} // namespace paddle + +// Tell linker to use reshape op +USE_OP(reshape); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, + ops::SqueezeOpInferShape, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape); + +REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker, + ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker); +REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp, + ops::Squeeze2GradInferShape); diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f4b48bc7391def082c82ed451fc5a752009a2f1 --- /dev/null +++ b/paddle/fluid/operators/stack_op.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/stack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; +REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, + ops::StackGradOpDescMaker); +REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); + +REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel, + ops::StackKernel); + +REGISTER_OP_CPU_KERNEL(stack_grad, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..92c1bde2bcf089e5c715e90e564408e6ad37ba17 --- /dev/null +++ b/paddle/fluid/operators/stack_op.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/stack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, + ops::StackKernel); + +REGISTER_OP_CUDA_KERNEL(stack_grad, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d236c5b943704683c27b9b155c11ca9113edf514 --- /dev/null +++ b/paddle/fluid/operators/stack_op.h @@ -0,0 +1,250 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +#ifdef __NVCC__ +#include +#include "paddle/fluid/framework/array.h" +#endif + +namespace paddle { +namespace operators { + +class StackOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0, + "Number of Inputs(X) must be larger than 0"); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist."); + + auto input_dims = ctx->GetInputsDim("X"); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], + "Dims of all Inputs(X) must be the same"); + } + + // Only lod of X[0] would be shared with Y + ctx->ShareLoD("X", /*->*/ "Y"); + + int axis = ctx->Attrs().Get("axis"); + int rank = input_dims[0].size(); + PADDLE_ENFORCE( + axis >= -(rank + 1) && axis < rank + 1, + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + if (axis < 0) axis += (rank + 1); + + auto vec = framework::vectorize2int(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + ctx->SetOutputDim("Y", framework::make_ddim(vec)); + } +}; + +class StackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of stack op.").AsDuplicable(); + AddOutput("Y", "The output of stack op."); + AddAttr("axis", + "The axis along which all of the Inputs(X) should be stacked.") + .SetDefault(0); + AddComment(R"DOC( + Stack Operator. + + Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same. 
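(Editorial note, not part of the patch: the stack kernels defined a little further below linearise the output as pre x n x post, where pre is the product of the input dimensions before Attr(axis), n is the number of inputs and post is the product of the remaining input dimensions. The sketch below is illustrative only; StackAlongAxis is not a Paddle API, but its index arithmetic matches the StackFunctor used by the kernels.)

#include <cstddef>
#include <vector>

// Stack n inputs, each of length pre * post, into one contiguous output of
// length pre * n * post. Output index layout: (i * n + k) * post + j, where
// i < pre indexes the block before the stack axis, k < n picks the input and
// j < post indexes the block from the stack axis onwards.
std::vector<float> StackAlongAxis(const std::vector<std::vector<float>>& xs,
                                  std::size_t pre, std::size_t post) {
  const std::size_t n = xs.size();
  std::vector<float> y(pre * n * post);
  for (std::size_t i = 0; i < pre; ++i) {
    for (std::size_t k = 0; k < n; ++k) {
      for (std::size_t j = 0; j < post; ++j) {
        y[(i * n + k) * post + j] = xs[k][i * post + j];
      }
    }
  }
  return y;
}
// Example: two inputs of shape [2, 3] stacked along axis 1 give shape
// [2, 2, 3]; here pre = 2 and post = 3, so y[(i * 2 + k) * 3 + j] equals
// xs[k][i * 3 + j].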
+ )DOC"); + } +}; + +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + +template +struct StackGradFunctor { + HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) + : dx_(dx), dy_(dy), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + dx_[which_x][x_index] = dy_[idx]; + } + + private: + VecDxType dx_; + const T *dy_; + int n_; + int post_; +}; + +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, T *y, int total_num, + int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + +template +static inline void StackGradFunctorForRange(const DeviceContext &ctx, + const VecDxType &dx, const T *dy, + int total_num, int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackGradFunctor(dx, dy, n, post)); +} + +template +class StackKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput("X"); + auto *y = ctx.Output("Y"); + + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto *y_data = y->mutable_data(ctx.GetPlace()); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto &dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + int total_num = pre * n * post; + + auto &dev_ctx = ctx.template device_context(); +#ifdef __NVCC__ + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); +#else + auto x_data_arr = x_datas.data(); +#endif + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif + } +}; + +class StackOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@Grad) must exist."); + + int axis = ctx->Attrs().Get("axis"); + auto dy_dim = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = dy_dim.size(); + PADDLE_ENFORCE(axis >= -rank && axis < rank, + "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + if (axis < 0) axis += rank; + + PADDLE_ENFORCE_EQ(ctx->Outputs(framework::GradVarName("X")).size(), + static_cast(dy_dim[axis]), + "Number of Outputs(X@Grad) is wrong"); + auto vec = framework::vectorize2int(dy_dim); + vec.erase(vec.begin() + axis); + ctx->SetOutputsDim( + framework::GradVarName("X"), + std::vector(dy_dim[axis], framework::make_ddim(vec))); + } +}; + +class StackGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr 
Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("stack_grad"); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +template +class StackGradKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += dy->dims().size(); + + int n = dy->dims()[axis]; + std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { + dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); + } + auto dy_data = dy->data(); + + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; + int total_num = dy->numel(); + int post = total_num / (n * pre); + + auto &dev_ctx = ctx.template device_context(); +#ifdef __NVCC__ + thrust::device_vector device_dx_vec(dx_datas); + auto dx_data_arr = device_dx_vec.data().get(); +#else + auto dx_data_arr = dx_datas.data(); +#endif + StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f78d977760f18c9eb1270e515e68acb208a7c9a4..f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -34,15 +34,15 @@ namespace paddle { namespace operators { -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; -using paddle::platform::CPUDeviceContext; using framework::DataLayout; using mkldnn::memory; using mkldnn::primitive; +using mkldnn::reorder; using mkldnn::stream; using mkldnn::sum; -using mkldnn::reorder; +using paddle::framework::Tensor; +using paddle::platform::CPUDeviceContext; +using paddle::platform::MKLDNNDeviceContext; using platform::to_void_cast; template @@ -88,7 +88,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { input_format = memory::format::nc; } - for (int i = in_place ? 1 : 0; i < N; i++) { + for (int i = 0; i < N; i++) { PADDLE_ENFORCE(in_vars[i]->IsType(), "all inputs must be all LoDTensors"); auto& input = in_vars[i]->Get(); @@ -175,18 +175,35 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto& sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } - auto in_dim = - framework::vectorize(get_selected_row(N - 1).value().dims()); + + std::vector in_dim; + for (int i = 0; i < N; i++) { + auto& sel_row = get_selected_row(i); + if (sel_row.rows().size() > 0) { + in_dim = framework::vectorize(sel_row.value().dims()); + break; + } + } + + if (in_dim.empty()) { + VLOG(3) << "WARNING: all the inputs are empty"; + in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + } else { + in_dim[0] = static_cast(first_dim); + } + in_dim[0] = static_cast(first_dim); out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(ctx.GetPlace()); + // if all the input sparse vars are empty, no need to // merge these vars. 
if (first_dim == 0UL) { return; } - out_value->mutable_data(ctx.GetPlace()); + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 49a4afb3a8a19c97e844e66477c6288772ece807..6dffe527c1072ee97fcde1725bfc1a47ed1ad74a 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -105,18 +105,30 @@ class SumKernel : public framework::OpKernel { auto &sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } - auto in_dim = - framework::vectorize(get_selected_row(N - 1).value().dims()); - in_dim[0] = static_cast(first_dim); + + std::vector in_dim; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + if (sel_row.rows().size() > 0) { + in_dim = framework::vectorize(sel_row.value().dims()); + break; + } + } + if (in_dim.empty()) { + VLOG(3) << "WARNING: all the inputs are empty"; + in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + } else { + in_dim[0] = static_cast(first_dim); + } out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(context.GetPlace()); // if all the input sparse vars are empty, no need to // merge these vars. if (first_dim == 0UL) { return; } - out_value->mutable_data(context.GetPlace()); math::SelectedRowsAddTo functor; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 647cfc0a0af2be85e2868c6f68cab962c6631a8d..1048d3017140c9e31426a1580b2862667116a024 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -17,105 +17,15 @@ #include #include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { -namespace operators { - -using inference::Singleton; -using inference::tensorrt::TRT_EngineManager; - -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - switch (shape.size()) { - case 2: - return nvinfer1::Dims2(shape[0], shape[1]); - case 3: - return nvinfer1::Dims3(shape[0], shape[1], shape[2]); - case 4: - return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); - default: - return nvinfer1::Dims(); - } - return nvinfer1::Dims(); -} +DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); +DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); -} // namespace - -template -void TensorRTEngineKernel::Prepare( - const framework::ExecutionContext &context) const { - VLOG(4) << "Prepare engine"; - // Get the ProgramDesc and pass to convert. 
- framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = context.Attr("max_batch"); - auto max_workspace = context.Attr("max_workspace"); - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto ¶m : params) { - parameters.insert(param); - } - - // TODO(Superjomn) replace this with a different stream - auto *engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, - context.Attr("engine_uniq_key")); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : context.Inputs("Xs")) { - VLOG(4) << "declare input " << input; - auto *var = block.FindVar(input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - auto shape = var->GetShape(); - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(var->GetShape())); - } - - inference::Singleton::Global().ConvertBlock( - block_desc, parameters, context.scope(), engine); - - // Add outputs - VLOG(4) << "declare outputs"; - for (auto &output : context.Outputs("Ys")) { - VLOG(4) << "declare output " << output; - engine->DeclareOutput(output); - } - - engine->FreezeNetwork(); -} +namespace operators { class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -124,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); - AddAttr("max_batch", "the maximum batch size."); - AddAttr("max_workspace", "the maximum batch size."); AddComment("TensorRT engine operator."); } }; @@ -144,11 +52,4 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); -REGISTER_OP_CPU_KERNEL( - tensorrt_engine, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel); - #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/tensorrt_engine_op.cu.cc similarity index 59% rename from paddle/fluid/operators/prelu_op.cu rename to paddle/fluid/operators/tensorrt_engine_op.cu.cc index 37d934a29046be04a1721b7330c813f663f61aed..e1ddfde6d51ef719ca0b89cf286b176195ee682a 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/tensorrt_engine_op.cu.cc @@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" + +namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - prelu, - paddle::operators::PReluKernel); -REGISTER_OP_CUDA_KERNEL(prelu_grad, - paddle::operators::PReluGradKernel< - paddle::platform::CUDADeviceContext, float>); + tensorrt_engine, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 1602a913aeebe43fabe2f9c9036edd18ac4c70fd..79e75ea9a035b654f0bb7026d3a491bebe0b23c4 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -19,13 +19,51 @@ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { + +DECLARE_int32(tensorrt_engine_batch_size); +DECLARE_int32(tensorrt_max_batch_size); +DECLARE_int32(tensorrt_workspace_size); + namespace operators { +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); +} + +} // namespace + using inference::Singleton; using inference::tensorrt::TRT_EngineManager; @@ -44,7 +82,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { .FindVar(input0) ->GetMutable() ->type()), - platform::CPUPlace()); + ctx.GetPlace()); return kt; } }; @@ -53,7 +91,6 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "TensorRTEngineKernel executing"; auto engine_name = context.Attr("engine_uniq_key"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); @@ -61,14 +98,20 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - // Try to determine a batch_size - auto& tensor0 = inference::analysis::GetFromScope( - context.scope(), input_names.front()); - int batch_size = tensor0.dims()[0]; - PADDLE_ENFORCE_LE(batch_size, context.Attr("max_batch")); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, + FLAGS_tensorrt_max_batch_size); + std::vector output_maps = + context.Attr>("output_name_mapping"); + + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto& param : params) { + parameters.insert(param); + } // Convert input tensor from fluid to engine. 
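The reworked Vec2TRT_Dims above drops the leading batch dimension and always hands TensorRT a CHW layout: a 4-D fluid shape keeps its C/H/W extents and a 2-D shape is padded with trailing 1s, while the batch dimension is carried separately through FLAGS_tensorrt_engine_batch_size. A standalone sketch of the same mapping, using a plain array instead of nvinfer1::DimsCHW:

#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of the fluid-shape -> TensorRT CHW mapping performed by Vec2TRT_Dims:
// {N, C, H, W} -> (C, H, W) and {N, C} -> (C, 1, 1).
std::array<int64_t, 3> ToCHW(const std::vector<int64_t>& shape) {
  assert(shape.size() == 4 || shape.size() == 2);
  if (shape.size() == 4) return {shape[1], shape[2], shape[3]};
  return {shape[1], 1, 1};
}

// e.g. ToCHW({2, 4, 1, 1}) == {4, 1, 1}, matching the {2, 4, 1, 1} input
// declared for "x" in the unit test below.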
for (const auto& x : context.Inputs("Xs")) { + if (parameters.count(x)) continue; // convert input and copy to TRT engine's buffer auto& t = inference::analysis::GetFromScope( context.scope(), x); @@ -81,39 +124,111 @@ class TensorRTEngineKernel : public framework::OpKernel { } } // Execute the engine. - PADDLE_ENFORCE_GT(batch_size, 0); - engine->Execute(batch_size); + PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); + engine->Execute(FLAGS_tensorrt_engine_batch_size); + // Convert output tensor from engine to fluid + int output_index = 0; + VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto& y : context.Outputs("Ys")) { + VLOG(4) << y; // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine->GetITensor(y); + nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. - std::vector ddim(dims.d, dims.d + dims.nbDims); + // The ITensor doesn't contain the batch size dim. + std::vector ddim; + ddim.push_back(FLAGS_tensorrt_engine_batch_size); + for (int i = 0; i < dims.nbDims; i++) { + ddim.push_back(dims.d[i]); + } auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto* fluid_t = fluid_v->GetMutable(); + fluid_t->Resize(framework::make_ddim(ddim)); - auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); - if (platform::is_cpu_place(fluid_t->place())) { - // TODO(Superjomn) change this float to dtype size. - engine->GetOutputInCPU( - y, fluid_t->mutable_data(platform::CPUPlace()), - size * sizeof(float)); - } else { - engine->GetOutputInGPU( - y, fluid_t->mutable_data(platform::CUDAPlace()), - size * sizeof(float)); - } + + // TODO(Superjomn) find some way to determine which device to output the + // tensor. + // if (platform::is_cpu_place(fluid_t->place())) { + // TODO(Superjomn) change this float to dtype size. + auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * + FLAGS_tensorrt_engine_batch_size; + engine->GetOutputInGPU( + output_maps[output_index], + fluid_t->mutable_data(platform::CUDAPlace( + boost::get(context.GetPlace()).device)), + size * sizeof(float)); + + output_index += 1; } cudaStreamSynchronize(*engine->stream()); } protected: - // Build the engine. - void Prepare(const framework::ExecutionContext& context) const; + void Prepare(const framework::ExecutionContext& context) const { + VLOG(4) << "Prepare engine"; + // Get the ProgramDesc and pass to convert. 
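One detail of the output path in Compute() above is worth spelling out: TensorRT's ITensor dimensions do not include the batch dimension, so the kernel rebuilds the fluid shape by prepending FLAGS_tensorrt_engine_batch_size before resizing the output LoDTensor. A standalone sketch of that reshaping step (the batch parameter stands in for the flag):

#include <cstdint>
#include <vector>

// Sketch: rebuild a fluid tensor shape from TensorRT output dims.
// trt_dims are the per-sample dims reported by the engine (no batch dim).
std::vector<int64_t> FluidOutputDims(const std::vector<int64_t>& trt_dims,
                                     int64_t batch) {
  std::vector<int64_t> ddim;
  ddim.reserve(trt_dims.size() + 1);
  ddim.push_back(batch);
  for (int64_t d : trt_dims) ddim.push_back(d);
  return ddim;
}

// e.g. FluidOutputDims({8, 1, 1}, 2) -> {2, 8, 1, 1}.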
+ framework::proto::BlockDesc block_desc; + block_desc.ParseFromString(context.Attr("subgraph")); + int max_batch = FLAGS_tensorrt_max_batch_size; + auto max_workspace = FLAGS_tensorrt_workspace_size; + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto& param : params) { + parameters.insert(param); + } + + std::vector output_maps = + context.Attr>("output_name_mapping"); + + // TODO(Superjomn) replace this with a different stream + auto* engine = Singleton::Global().Create( + max_batch, max_workspace, nullptr /*engine hold its own stream*/, + context.Attr("engine_uniq_key"), + boost::get(context.GetPlace()).device); + + engine->InitNetwork(); + + framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + VLOG(4) << "parsed var size " << block.AllVars().size(); + // Add inputs + VLOG(4) << "declare inputs"; + for (auto& input : context.Inputs("Xs")) { + if (parameters.count(input)) continue; + VLOG(4) << "declare input " << input; + auto* var = block.FindVar(input); + // TensorRT engine need to create parameters. The parameter's description + // should be set in + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto shape = var->GetShape(); + // For the special batch_size placeholder -1, drop it and pass the real + // shape of data. + // TODO(Superjomn) fix this with batch broadcast, or it can't handle + // variational batch size. + if (shape[0] == -1) { + shape[0] = FLAGS_tensorrt_engine_batch_size; + } + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(shape)); + } + + inference::Singleton::Global() + .ConvertBlock(block_desc, parameters, context.scope(), engine); + + // Add outputs + for (auto& output : output_maps) { + engine->DeclareOutput(output); + } + + engine->FreezeNetwork(); + } }; } // namespace operators diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 82a16361e40513aeaf6f510e450f58989369fcdb..27c1d29762b3de5e57f877b271aae52e71eb7cf9 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/tensorrt_engine_op.h" #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -23,20 +24,20 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_CPU_ONLY_OP(tensorrt_engine); +USE_CUDA_ONLY_OP(tensorrt_engine); namespace paddle { namespace operators { namespace { -void CreateCPUTensor(framework::Scope* scope, const std::string& name, - const std::vector& shape) { +void CreateCUDATensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); auto dims = framework::make_ddim(shape); tensor->Resize(dims); - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { + FLAGS_tensorrt_engine_batch_size = 2; + FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -64,59 +67,61 @@ TEST(TensorRTEngineOp, manual) { LOG(INFO) << "create block desc"; framework::BlockDesc block_desc(&program, block_); - LOG(INFO) << "create mul op"; - auto* mul = block_desc.AppendOp(); - mul->SetType("mul"); - mul->SetInput("X", std::vector({"x"})); // 2 x 4 - mul->SetInput("Y", std::vector({"y"})); // 4 x 6 - mul->SetOutput("Out", std::vector({"z"})); // 2 x 6 + LOG(INFO) << "create fc op"; + auto* fc0 = block_desc.AppendOp(); + fc0->SetType("fc"); + fc0->SetInput("X", std::vector({"x"})); // 4 x 1 x 1 + fc0->SetInput("Y", std::vector({"y"})); // 4 x 6 + fc0->SetOutput("Out", std::vector({"z"})); // 6 x 1 x 1 LOG(INFO) << "create fc op"; - auto* fc = block_desc.AppendOp(); - fc->SetType("mul"); - fc->SetInput("X", std::vector({"z"})); - fc->SetInput("Y", std::vector({"y0"})); // 6 x 8 - fc->SetOutput("Out", std::vector({"z0"})); // 2 x 8 + auto* fc1 = block_desc.AppendOp(); + fc1->SetType("fc"); + fc1->SetInput("X", std::vector({"z"})); + fc1->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc1->SetOutput("Out", std::vector({"z0"})); // 8 x 1 x 1 // Set inputs' variable shape in BlockDesc - AddTensorToBlockDesc(block_, "x", std::vector({2, 4})); + // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1} + AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 1, 1})); AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); // It is wired, need to copy manually. 
- *block_->add_ops() = *mul->Proto(); - *block_->add_ops() = *fc->Proto(); + *block_->add_ops() = *fc0->Proto(); + *block_->add_ops() = *fc1->Proto(); ASSERT_EQ(block_->ops_size(), 2); LOG(INFO) << "create tensorrt desc"; framework::OpDesc engine_op_desc(nullptr); engine_op_desc.SetType("tensorrt_engine"); - engine_op_desc.SetInput("Xs", std::vector({"x", "y", "y0"})); + engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", 100); - SetAttr(engine_op_desc.Proto(), "max_workspace", 1 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); + SetAttr>(engine_op_desc.Proto(), + "output_name_mapping", + std::vector({"z0"})); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); // Prepare variables. - CreateCPUTensor(&scope, "x", std::vector({2, 4})); - CreateCPUTensor(&scope, "y", std::vector({4, 6})); - CreateCPUTensor(&scope, "z", std::vector({2, 6})); + CreateCUDATensor(&scope, "x", std::vector({2, 4})); + CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "z", std::vector({2, 6})); - CreateCPUTensor(&scope, "y0", std::vector({6, 8})); - CreateCPUTensor(&scope, "z0", std::vector({2, 8})); + CreateCUDATensor(&scope, "y0", std::vector({6, 8})); + CreateCUDATensor(&scope, "z0", std::vector({2, 8})); // Execute them. LOG(INFO) << "engine_op run"; @@ -124,10 +129,12 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + FLAGS_tensorrt_engine_batch_size = batch_size; + FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -161,10 +168,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { // Prepare variables. if (!x_created) { - CreateCPUTensor(&scope, x_name, std::vector(x_shape)); + CreateCUDATensor(&scope, x_name, std::vector(x_shape)); } - CreateCPUTensor(&scope, y_name, std::vector(y_shape)); - CreateCPUTensor(&scope, z_name, std::vector(z_shape)); + CreateCUDATensor(&scope, y_name, std::vector(y_shape)); + CreateCUDATensor(&scope, z_name, std::vector(z_shape)); // It is wired, need to copy manually. *block_->add_ops() = *fc->Proto(); @@ -195,6 +202,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { std::vector({"y0", "y1", "y2", "y3"})); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); + SetAttr>(engine_op_desc.Proto(), + "output_name_mapping", + std::vector({"z3"})); + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); // Execute them. 
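To summarize the interface change this test exercises: max_batch and max_workspace are no longer op attributes but gflags, and the op now expects an output_name_mapping attribute alongside subgraph, engine_uniq_key and parameters. A hedged sketch of the minimal setup a caller would perform, mirroring the names used in TEST(TensorRTEngineOp, manual) above (not a complete program):

// Flags replace the old max_batch / max_workspace attributes.
FLAGS_tensorrt_engine_batch_size = 2;     // runtime batch fed to the engine
FLAGS_tensorrt_max_batch_size = 2;        // engine build-time upper bound
FLAGS_tensorrt_workspace_size = 16 << 20; // also the default

framework::OpDesc desc(nullptr);
desc.SetType("tensorrt_engine");
desc.SetInput("Xs", std::vector<std::string>({"x"}));
desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(desc.Proto(), "subgraph", block_->SerializeAsString());
SetAttr<std::string>(desc.Proto(), "engine_uniq_key", "a_engine");
SetAttr<std::vector<std::string>>(desc.Proto(), "parameters",
                                  std::vector<std::string>({}));
SetAttr<std::vector<std::string>>(desc.Proto(), "output_name_mapping",
                                  std::vector<std::string>({"z0"}));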
@@ -207,5 +218,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } } // namespace operators } // namespace paddle -USE_TRT_CONVERTER(mul) USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 7ddb82ef6ff063868a4b9b603b8ab89700b9dd13..054dd481994d03f71b0ed5dc73e103085f6c91aa 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel { #endif for (size_t i = 0; i < row; i++) { std::vector> vec; + vec.reserve(col); for (size_t j = 0; j < col; j++) { vec.push_back(std::pair(eg_input(i, j), j)); } diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 60556a564c25c08612447ebd47a4b432b8a12d29..6a9fc6611a8f8eaa6749aefac0673ccabaebbcfe 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" +#include #include namespace paddle { @@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); auto x_dims = ctx->GetInputDim("X"); @@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works. 2 &5 \end{pmatrix}$$ -- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is $[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. )DOC"); @@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); @@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel { } }; +// FIXME(zcd): transpose2 adds an intermediate output(XShape) based on +// transpose, the XShape is used to carry the shape and lod of X which +// will be used in transpose_grad, in this way, the framework can reuse +// the memory of X immediately the transpose2_op is finished. 
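The XShape trick referenced in this comment (and reused by unsqueeze2 below) works as follows: the forward op stores X's real dims shifted one slot to the right with a dummy 0 in front, so XShape ties up no real memory, and the grad op recovers the dims by slicing off that leading entry with slice_ddim. A standalone sketch of the encode/decode pair:

#include <cstddef>
#include <cstdint>
#include <vector>

// Encode: XShape dims = {0, d0, d1, ...}.
std::vector<int64_t> EncodeXShape(const std::vector<int64_t>& x_dims) {
  std::vector<int64_t> xshape(x_dims.size() + 1, 0);
  for (size_t i = 0; i < x_dims.size(); ++i) xshape[i + 1] = x_dims[i];
  return xshape;
}

// Decode (what Transpose2OpGrad does with slice_ddim): drop the leading 0.
std::vector<int64_t> DecodeXShape(const std::vector<int64_t>& xshape_dims) {
  return std::vector<int64_t>(xshape_dims.begin() + 1, xshape_dims.end());
}

// e.g. EncodeXShape({2, 3, 4}) == {0, 2, 3, 4};
//      DecodeXShape({0, 2, 3, 4}) == {2, 3, 4}.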
+// Considering compatibility issues, we could not fix transpose2_op +class Transpose2Op : public TransposeOp { + public: + Transpose2Op(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : TransposeOp(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + TransposeOp::InferShape(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) should not be null"); + const auto &in_dims = ctx->GetInputDim("X"); + std::vector x_shape_dim(in_dims.size() + 1); + x_shape_dim[0] = 0; + for (int i = 0; i < in_dims.size(); ++i) { + x_shape_dim[i + 1] = in_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class Transpose2OpMaker : public TransposeOpMaker { + public: + void Make() override { + TransposeOpMaker::Make(); + AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate(); + } +}; + +class Transpose2GradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("transpose2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Transpose2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + auto xshape_dim = ctx->GetInputDim("XShape"); + auto x_shape_dim = + framework::slice_ddim(xshape_dim, 1, xshape_dim.size()); + ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim); + ctx->ShareLoD("XShape", framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } +}; + } // namespace operators } // namespace paddle @@ -120,8 +208,20 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); + REGISTER_OP_CPU_KERNEL( transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, ops::TransposeGradKernel); + +REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, + ops::Transpose2GradMaker); +REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad); + +REGISTER_OP_CPU_KERNEL( + transpose2, + ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose2_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc 
b/paddle/fluid/operators/transpose_op.cu.cc index bcd1fb631394bc33b6fc162cfa7cbb20d55a654b..c1b5a8b31be243fab3af06a18c8e51986c953700 100644 --- a/paddle/fluid/operators/transpose_op.cu.cc +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL( transpose_grad, ops::TransposeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + transpose2, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( + transpose2_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index edd1baa4ace4e246190afcd12b0716f1dd38e243..763bb403588d13c15271d26b09813dddf3a5dd8c 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -30,12 +30,14 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { auto shape = ctx.Attr>("shape"); - tensor = out_var->GetMutable()->mutable_value(); + auto* selected_rows = out_var->GetMutable(); + tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); + selected_rows->mutable_rows()->reserve(shape[0]); } else { PADDLE_THROW( "uniform_random_op's output only" - "supports SelectedRows and Tensor"); + "supports SelectedRows and LoDTensor"); } T* data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index e1c7323a30233f4ec4f60e46aa6088ee6d8601b7..bbb692b0ddfc18e8a62c0d2a6bac88f9932f6704 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -54,7 +54,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { } else { PADDLE_THROW( "uniform_random_op's output only" - "supports SelectedRows and Tensor"); + "supports SelectedRows and LoDTensor"); } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..405943add238ac2d245df11127bfadb4899e855f --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -0,0 +1,293 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class UnsqueezeOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Unsqueeze operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Unsqueeze operator should not be null."); + + const auto &axes = ctx->Attrs().Get>("axes"); + const auto &x_dims = ctx->GetInputDim("X"); + // Validity Check: input tensor dims (<6). 
+ PADDLE_ENFORCE(x_dims.size() <= 6, + "Invalid dimensions, the rank of Input(X) " + "should be in the range of [1, 6] (Eigen limit)"); + auto out_dims = GetOutputShape(axes, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", "Out"); + } + } + + static framework::DDim GetOutputShape(const std::vector unsqz_dims, + const framework::DDim &in_dims) { + int output_size = in_dims.size() + static_cast(unsqz_dims.size()); + int cur_output_size = in_dims.size(); + std::vector output_shape(output_size, 0); + + // Validity Check: rank range. + PADDLE_ENFORCE(output_size <= 6, + "The output tensor's rank should be less than 6."); + + for (int axis : unsqz_dims) { + int cur = axis < 0 ? axis + cur_output_size + 1 : axis; + // Vaildity Check: the axis bound + PADDLE_ENFORCE( + cur >= 0 && cur <= cur_output_size, + "The unsqueeze dims must be within range of current rank."); + // Move old axis, and insert new axis + for (int i = cur_output_size; i >= cur; --i) { + if (output_shape[i] == 1) { + // Move axis + output_shape[i + 1] = 1; + output_shape[i] = 0; + } + } + output_shape[cur] = 1; + // Add the output size. + cur_output_size++; + } + + // Make output shape + for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { + if (output_shape[out_idx] == 0) { + output_shape[out_idx] = in_dims[in_idx++]; + } + } + + return framework::make_ddim(output_shape); + } +}; + +class UnsqueezeOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + // Invoke Reshape op. + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor). The input tensor of unsqueeze operator."); + AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator."); + AddAttr>("axes", + "(std::vector). List of integers," + " indicating the dimensions to be inserted") + .AddCustomChecker([](const std::vector &axes) { + PADDLE_ENFORCE(!axes.empty(), + "Invalid axes, The unsqueeze axes is empty."); + // Validity Check: axes dims (<6). + PADDLE_ENFORCE(static_cast(axes.size()) < 6, + "Invalid dimensions, dynamic dimensions should be " + "within [1, 6] dimensions (Eigen limit)."); + // Validity Check: the range of unsqueeze aixs. + for (int axis : axes) { + PADDLE_ENFORCE(axis < 6, + "Invalid dimensions, input axis should be" + " within [1, 6] dimensions (Eigen limit)."); + } + }); + AddComment(R"DOC( + Unsqueeze Operator. + + Insert single-dimensional entries to the shape of a tensor. + Takes one required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. 
+ + For example: + Given a tensor such that tensor with shape [3, 4, 5], + then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1] + )DOC"); + } +}; + +class UnsqueezeGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class UnsqueezeGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, + attrs); + reshape_op->Run(scope, place); + } +}; + +// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on +// unsqueeze, the XShape is used to carry the shape and lod of X which +// will be used in unsqueeze_grad, in this way, the framework can reuse +// the memory of X immediately the unsqueeze2_op is finished. +// Considering compatibility issues, we could not fix unsqueeze2_op +class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape { + public: + void operator()(framework::InferShapeContext *ctx) const override { + UnsqueezeOpInferShape::operator()(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) of Unsqueeze operator should not be null."); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } +}; + +class Unsqueeze2OpMaker : public UnsqueezeOpMaker { + public: + void Make() override { + UnsqueezeOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in UnsqueezeGradOp.") + .AsIntermediate(); + } +}; + +class Unsqueeze2Op : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + // Invoke Reshape op. 
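Both unsqueeze ops are thin wrappers: they compute the target shape with GetOutputShape and delegate the actual data movement to reshape (or reshape2, which additionally emits XShape, as in the call just below). A simplified standalone sketch of the shape computation, reproducing the documented example; unlike the real implementation it assumes non-negative, non-colliding axes:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of unsqueeze shape inference: place a 1 at every requested output
// axis, then fill the remaining slots with the input dims in order.
std::vector<int64_t> UnsqueezeShape(const std::vector<int>& axes,
                                    const std::vector<int64_t>& in_dims) {
  const int out_rank = static_cast<int>(in_dims.size() + axes.size());
  std::vector<int64_t> out(out_rank, 0);
  for (int axis : axes) {
    assert(axis >= 0 && axis < out_rank && out[axis] == 0);
    out[axis] = 1;
  }
  size_t in_idx = 0;
  for (int i = 0; i < out_rank; ++i) {
    if (out[i] == 0) out[i] = in_dims[in_idx++];
  }
  return out;
}

// UnsqueezeShape({0, 4}, {3, 4, 5}) == {1, 3, 4, 5, 1}, as in the DOC example.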
+ auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("unsqueeze2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Unsqueeze2GradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("XShape"), + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } +}; + +class Unsqueeze2GradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto xshape_name = Input("XShape"); + auto xshape_dims = + scope.FindVar(xshape_name)->Get().dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, + {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); + reshape_op->Run(scope, place); + } +}; +} // namespace operators +} // namespace paddle + +// Tell linker to use reshape op. +USE_OP(reshape); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, + ops::UnsqueezeOpInferShape, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, + ops::UnsqueezeGradInferShape); + +REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, + ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker); +REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, + ops::Unsqueeze2GradInferShape); diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ff3249cc333231a0624cd5aab9603a6a75f4480 --- /dev/null +++ b/paddle/fluid/operators/unstack_op.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/unstack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +USE_OP(stack); + +REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, + ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker); + +REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp, + ops::UnStackOpGradInferShape); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h new file mode 100644 index 0000000000000000000000000000000000000000..348a1038804ccb2551e5f729cc1a38bcef1511f5 --- /dev/null +++ b/paddle/fluid/operators/unstack_op.h @@ -0,0 +1,135 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class UnStackOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist."); + + int axis = ctx->Attrs().Get("axis"); + int num = ctx->Attrs().Get("num"); + auto x_dim = ctx->GetInputDim("X"); + int rank = x_dim.size(); + PADDLE_ENFORCE(axis >= -rank && axis < rank, + "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + if (axis < 0) axis += rank; + + PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), + "Number of Outputs(Y) is wrong"); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong"); + } + auto vec = framework::vectorize2int(x_dim); + vec.erase(vec.begin() + axis); + ctx->SetOutputsDim("Y", std::vector( // NOLINT + x_dim[axis], framework::make_ddim(vec))); + } +}; + +class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of unstack op."); + AddOutput("Y", "The output of unstack op.").AsDuplicable(); + AddAttr("axis", "The axis along which Input(X) should be unstacked.") + .SetDefault(0); + AddAttr("num", "The number of outputs(Y).").GreaterThan(0); + AddComment(R"DOC( + UnStack Operator. + + UnStack Input(X) into several tensors along Attr(axis). 
+ )DOC"); + } +}; + +class UnStackOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto stack_grad_op = framework::OpRegistry::CreateOp( + "stack_grad", {{framework::GradVarName("Y"), {Input("X")}}}, + {{framework::GradVarName("X"), Outputs("Y")}}, Attrs()); + stack_grad_op->Run(scope, place); + } +}; + +class UnStackOpGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0, + "Number of Inputs(Y@Grad) must be larger than 0"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) must exist."); + + auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y")); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], + "Dims of all Inputs(Y@Grad) must be the same"); + } + + int axis = ctx->Attrs().Get("axis"); + int rank = input_dims[0].size(); + PADDLE_ENFORCE( + axis >= -(rank + 1) && axis < rank + 1, + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + if (axis < 0) axis += (rank + 1); + + auto vec = framework::vectorize2int(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec)); + } +}; + +class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("unstack_grad"); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class UnStackGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto stack_op = framework::OpRegistry::CreateOp( + "stack", {{"X", Inputs(framework::GradVarName("Y"))}}, + {{"Y", {Output(framework::GradVarName("X"))}}}, Attrs()); + stack_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index ab70c1f0592d122ba248a101db487e64c0bdae6f..444265f58de732f07c5db2abd87811a063016866 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel { framework::make_ddim({static_cast(num_sequences), 1}); // warpctc needs sequences data stored in transposed padding format - Tensor warpctc_logits; + LoDTensor warpctc_logits; const size_t max_sequence_length = - math::MaximumSequenceLength(logits_lod, level); + math::MaximumSequenceLength(logits_lod[level]); auto warpctc_logits_dims = framework::make_ddim({static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + + LoDTensor cpu_pad_value; + T* pad_value_data = + cpu_pad_value.mutable_data({1}, platform::CPUPlace()); + *pad_value_data = static_cast(0); + LoDTensor pad_value; + if (platform::is_cpu_place(ctx.GetPlace())) { + pad_value = cpu_pad_value; + } else { + 
TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + } + math::PaddingLoDTensorFunctor()( ctx.template device_context(), *logits, &warpctc_logits, - false); + pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth); const T* warpctc_logits_data = warpctc_logits.data(); std::vector warpctc_label_lengths(num_sequences); @@ -209,15 +221,15 @@ template class WarpCTCGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), logits_grad, - *warpctc_grad, norm_by_times); + ctx.template device_context(), *warpctc_grad, + logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); const T* loss_grad_data = loss_grad->data(); math::ScaleLoDTensorFunctor()( diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 733157ea05ed39434b9a750e3a94ea548f512ce6..65a3bc928e47ac60f06e7efc75f42703e45acbb4 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -57,12 +57,16 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), "Condition of while op must in CPU memory."); + + bool is_test = Attr("is_test"); + auto ctx = executor.Prepare(*program, block->ID()); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); - - executor.Run(*program, ¤t_scope, block->ID(), - false /*create_local_scope*/); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false); + if (is_test) { + scope.DeleteScope(¤t_scope); + } } } }; @@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "variables generated in the i'th step."); AddAttr(kStepBlock, "The step block inside WhileOp"); + AddAttr("is_test", "True if in test phase.").SetDefault(false); AddComment(R"DOC( )DOC"); } @@ -103,12 +108,15 @@ class WhileGradOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + PADDLE_ENFORCE(!Attr("is_test"), + "GradOp is only callable when is_test is false"); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); + auto ctx = executor.Prepare(*program, block->ID()); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -161,8 +169,7 @@ class WhileGradOp : public framework::OperatorBase { } } } - - executor.Run(*program, *cur_scope_iter, block->ID(), false); + executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false); auto &pg_names = Outputs(kXGRAD); auto &p_names = Inputs(kX); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index ac9bf9a505d2f03ba511c9a65ec6851cf605ab8b..5af8af640e43a5b2e5ee9856f09f66a9fdf4463c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,3 +1,4 @@ +if (NOT WIN32) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) py_proto_compile(profiler_py_proto SRCS 
profiler.proto) @@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif(NOT WIN32) if(WITH_GPU) nv_library(enforce SRCS enforce.cc) @@ -18,7 +20,11 @@ else() endif() cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) -cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce) +set(CPU_INFO_DEPS gflags glog enforce) +IF(WITH_XBYAK) + list(APPEND CPU_INFO_DEPS xbyak) +ENDIF() +cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) @@ -54,9 +60,16 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + +if (NOT WIN32) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) +endif(NOT WIN32) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) + +IF(WITH_GPU) + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) +ENDIF() diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 77ecb170111d63f23312d06fa8a8172bc45f2a4e..234a04b5c2eb5ee643e8a4e723b28331cd8e6ee0 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_MKLML +#include #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index f832d72b53e8d06a32d5c0ac2ecf7130aa28a666..2880c09263f10e9c624e11b77188171f48d9db28 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -14,12 +14,21 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" +#ifdef PADDLE_WITH_XBYAK +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" +#endif + #ifdef __APPLE__ #include #include + +#elif defined(_WIN32) +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include #else #include -#endif +#endif // _WIN32 #include #include "gflags/gflags.h" @@ -27,16 +36,20 @@ limitations under the License. 
*/ DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); - +#if !defined(_WIN32) DEFINE_uint64(initial_cpu_memory_in_mb, #ifdef PADDLE_WITH_MKLDNN /* Aligned with mozga-intel, MKLDNN need at least 5000 MB * to obtain the best performance*/ - 5000, + 5000ul, #else - 500, + 500ul, #endif "Initial CPU memory for PaddlePaddle, in MD unit."); +#else +DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, + "Initial CPU memory for PaddlePaddle, in MD unit."); +#endif // !defined(_WIN32) DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, @@ -55,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() { size_t len = sizeof(size); if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; return 0L; +#elif defined(_WIN32) + MEMORYSTATUSEX sMeminfo; + sMeminfo.dwLength = sizeof(sMeminfo); + GlobalMemoryStatusEx(&sMeminfo); + return sMeminfo.ullTotalPhys; #else int64_t pages = sysconf(_SC_PHYS_PAGES); int64_t page_size = sysconf(_SC_PAGE_SIZE); @@ -98,5 +116,48 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } +namespace jit { +#ifdef PADDLE_WITH_XBYAK +static Xbyak::util::Cpu cpu; +bool MayIUse(const cpu_isa_t cpu_isa) { + using namespace Xbyak::util; // NOLINT + switch (cpu_isa) { + case sse42: + return cpu.has(Cpu::tSSE42); + case avx: + return cpu.has(Cpu::tAVX); + case avx2: + return cpu.has(Cpu::tAVX2); + case avx512_common: + return cpu.has(Cpu::tAVX512F); + case avx512_core: + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && + cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ); + case avx512_core_vnni: + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && + cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) && + cpu.has(Cpu::tAVX512_VNNI); + case avx512_mic: + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) && + cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF); + case avx512_mic_4ops: + return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) && + cpu.has(Cpu::tAVX512_4VNNIW); + case isa_any: + return true; + } + return false; +} +#else +bool MayIUse(const cpu_isa_t cpu_isa) { + if (cpu_isa == isa_any) { + return true; + } else { + return false; + } +} +#endif + +} // namespace jit } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index f06c2b67fe4385f427322e9bb2f3080fdd3acc94..30c8fbcfce92a8b06a175ddf198cde572f72b2a4 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -37,5 +37,23 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); +namespace jit { +typedef enum { + isa_any, + sse42, + avx, + avx2, + avx512_common, + avx512_core, + avx512_core_vnni, + avx512_mic, + avx512_mic_4ops, +} cpu_isa_t; // Instruction set architecture + +// May I use some instruction +bool MayIUse(const cpu_isa_t cpu_isa); + +} // namespace jit + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index ecec4178f2d9937920e52eb74bf9068b84e741a0..9f504d14a8da116648483c0f64cb511b46e6a97e 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -14,6 +14,10 @@ limitations under the License. */ #pragma once #include +// NOTE(): support float16 to half in header file. 
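The platform::jit::MayIUse helper added in cpu_info above lets kernels pick an implementation based on the detected instruction set (via xbyak when PADDLE_WITH_XBYAK is on; otherwise only isa_any reports true). A hedged sketch of how a caller might probe it, assuming the program links against the paddle platform library:

#include <cstdio>
#include "paddle/fluid/platform/cpu_info.h"

namespace jit = paddle::platform::jit;

int main() {
  // Report what the runtime detection sees; a kernel would branch on these
  // to choose between a SIMD path and a generic fallback.
  std::printf("avx:            %d\n", jit::MayIUse(jit::avx));
  std::printf("avx2:           %d\n", jit::MayIUse(jit::avx2));
  std::printf("avx512_common:  %d\n", jit::MayIUse(jit::avx512_common));
  return 0;
}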
+#define PADDLE_CUDA_FP16 +#include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { @@ -32,10 +36,29 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, #if CUDA_VERSION < 9000 return __shfl_down(val, delta, width); #else - return __shfl_down_sync(mask, val, delta, width); + return __shfl_down_sync(mask, val, static_cast(delta), width); #endif } +// CUDA 9.0 have native compatible float16 shfl_down +#if CUDA_VERSION < 9000 +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16( + __shfl_down(static_cast(val), static_cast(delta), width)); +} +#else +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); +} +#endif + template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { @@ -46,6 +69,11 @@ __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, #endif } +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + template __device__ T reduceSum(T val, int tid, int len) { // NOTE(zcd): The warp size should be taken from the diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee45afab93d079374aefe366425502890854c28d --- /dev/null +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -0,0 +1,234 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; + +template +__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); + } +} + +template +struct AddFunctor { + T operator()(const T& a, const T& b) { return a + b; } +}; + +template +void TestCase(size_t num) { + T *in1, *in2, *out; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(in1[i], in2[i])), 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +// cuda primitives +TEST(CudaAtomic, Add) { + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +TEST(CudaAtomic, float16) { + TestCase(static_cast(1)); + TestCase(static_cast(2)); + TestCase(static_cast(3)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +// unalignment of uint8 +void TestUnalign(size_t num, const int shift_bit) { + PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + float16 *in1, *in2, *out; + float16 *d_in1, *d_in2; + size_t size = sizeof(uint8_t) * (num + shift_bit); + size_t array_size = sizeof(float16) * (num / 2); + + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + + // right shift 1, mimic the unalignment of address + float16* r_in1 = + reinterpret_cast(reinterpret_cast(in1) + shift_bit); + float16* r_in2 = + reinterpret_cast(reinterpret_cast(in2) + shift_bit); + + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num / 2; ++i) { + r_in1[i] = static_cast(dist(engine)); + r_in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num / 2; ++i) { + // NOTE(dzhwinter): the float16 add has small truncate error. + // so we use EXPECT_NEAR to check the result. 
+ EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(r_in1[i], r_in2[i])), + 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaAtomic, float16Unalign) { + // same with float16 testcase + TestUnalign(static_cast(2), /*shift_bit*/ 2); + TestUnalign(static_cast(1024), /*shift_bit*/ 2); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 2); + + // shift the address. + TestUnalign(static_cast(2), /*shift_bit*/ 1); + TestUnalign(static_cast(1024), /*shift_bit*/ 1); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 1); + + TestUnalign(static_cast(2), /*shift_bit*/ 3); + TestUnalign(static_cast(1024), /*shift_bit*/ 3); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); +} + +// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ +template +static __forceinline__ __device__ T WarpReduceSum(T val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +template +__forceinline__ __device__ T BlockReduce(T val) { + static __shared__ T shared[32]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = WarpReduceSum(val); // Each warp performs partial reduction + + if (lane == 0) shared[wid] = val; // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + + // read from shared memory only if that warp existed + val = + (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast(0); + + if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp + + return val; +} + +template +__global__ void DeviceReduceSum(T* in, T* out, size_t N) { + T sum(0); + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + sum += in[i]; + } + sum = BlockReduce(sum); + __syncthreads(); + if (threadIdx.x == 0) out[blockIdx.x] = sum; +} + +template +void TestReduce(size_t num, float atol = 0.01) { + T* in1; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), sizeof(T)); + in1 = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + } + auto out = std::accumulate(in1, in1 + num, static_cast(0)); + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + DeviceReduceSum<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(in1[0]), static_cast(out), atol); + free(in1); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaShuffleSync, float16) { + TestReduce(10); + TestReduce(1000); + + // float16 will overflow or accumulate truncate errors in big size. + TestReduce(10); + TestReduce(100, /*atol error*/ 1.0); +} diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index d535ed2f89df6a0b311ec068ecd92c8e3183cee7..67ea64833d3b844d88a2e5996f860ef165bd8ffd 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -14,12 +14,14 @@ limitations under the License. 
*/ #pragma once #include +#include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { #define CUDA_ATOMIC_WRAPPER(op, T) \ - __device__ __forceinline__ T CudaAtomic##op(T* address, const T val) + __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) #define USE_CUDA_ATOMIC(op, T) \ CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } @@ -42,17 +44,17 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT "long long should be int64"); return CudaAtomicAdd( - reinterpret_cast(address), // NOLINT - static_cast(val)); // NOLINT + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int* address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT + unsigned long long int *address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; @@ -64,6 +66,67 @@ CUDA_ATOMIC_WRAPPER(Add, double) { return __longlong_as_double(old); } +#endif + +#ifdef PADDLE_CUDA_FP16 +// NOTE(dzhwinter): cuda do not have atomicCAS for half. +// Just use the half address as a unsigned value address and +// do the atomicCAS. According to the value store at high 16 bits +// or low 16 bits, then do a different sum and CAS. +// Given most warp-threads will failed on the atomicCAS, so this +// implemented should be avoided in high concurrency. It's will be +// slower than the way convert value into 32bits and do a full atomicCAS. + +// convert the value into float and do the add arithmetic. +// then store the result into a uint32. +inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + // concrete packed float16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the float16 value stay at higher 16 bits of the address. 
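Since CUDA has no native atomicCAS for 16-bit values, the float16 CudaAtomicAdd above operates on the aligned 32-bit word that contains the half: add_to_low_half/add_to_high_half rebuild that word with only one 16-bit lane changed, and the atomicCAS loop retries whenever the neighbouring half was modified concurrently. A host-side sketch of just the bit packing (plain C++, with raw uint16_t payloads standing in for the fp16 bits):

#include <cstdint>
#include <cstdio>

// Rebuild a 32-bit word with a new low 16-bit lane, high lane untouched.
static uint32_t ReplaceLowHalf(uint32_t word, uint16_t new_low) {
  return (word & 0xFFFF0000u) | static_cast<uint32_t>(new_low);
}

// Rebuild a 32-bit word with a new high 16-bit lane, low lane untouched.
static uint32_t ReplaceHighHalf(uint32_t word, uint16_t new_high) {
  return (word & 0x0000FFFFu) | (static_cast<uint32_t>(new_high) << 16);
}

int main() {
  uint32_t word = 0xAAAA5555u;  // two packed 16-bit payloads
  printf("%08X\n", ReplaceLowHalf(word, 0x1234));   // prints AAAA1234
  printf("%08X\n", ReplaceHighHalf(word, 0x1234));  // prints 12345555
  return 0;
}

On the device, the same rebuild happens inside atomicCAS on the containing 32-bit address, which is what makes the half-word update atomic.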
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} + #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2..bb8b14bb9fa41942c3aa653ca224c0842fbf9a00 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -59,13 +59,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) -#define CUDNN_ENFORCE(condition) \ - do { \ - cudnnStatus_t status = condition; \ - if (status != CUDNN_STATUS_SUCCESS) { \ - VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ - PADDLE_THROW("cuDNN call failed"); \ - } \ +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ + PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ + } \ } while (false) enum class DataLayout { // Not use diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a57ee2d8f5e598adc33e7fd8f1f354d9a372cd12..dfc079e986e93c7f02f17b299e5d6293edbedd05 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/rw_lock.h" +#endif namespace paddle { namespace platform { @@ -142,7 +145,58 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { +class CudnnHolder { + public: + CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) + : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); + } + + cudnnHandle_t cudnn_handle() const { return cudnn_handle_; } + + void RunFunc(const std::function& cudnn_func, + size_t required_workspace_len) { + std::lock_guard lock(mtx_); + if (required_workspace_len > workspace_len_) { + ReallocateWorkspace(required_workspace_len); + } + cudnn_func(workspace_); + } + + ~CudnnHolder() { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + if (workspace_ != nullptr) { + paddle::memory::Free(place_, workspace_); + } + } + + private: + void ReallocateWorkspace(size_t required_workspace_len) { + if (required_workspace_len <= workspace_len_) { + return; + } + if (workspace_ != nullptr) { + // Maybe someone is using the current workspace + PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); + paddle::memory::Free(place_, workspace_); + } + workspace_ = paddle::memory::Alloc(place_, required_workspace_len); + workspace_len_ = required_workspace_len; + } + + cudnnHandle_t cudnn_handle_; + void* workspace_; + size_t workspace_len_; + + const cudaStream_t* stream_; // not owned; + const CUDAPlace place_; + + std::mutex mtx_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); @@ -154,10 +208,7 @@ 
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); if (dynload::HasCUDNN()) { - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); - } else { - cudnn_handle_ = nullptr; + cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } callback_manager_.reset(new StreamCallbackManager(stream_)); @@ -168,9 +219,6 @@ CUDADeviceContext::~CUDADeviceContext() { Wait(); WaitStreamCallback(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - if (cudnn_handle_ != nullptr) { - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); - } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -199,7 +247,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { return cublas_handle_; } -cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { + return cudnn_holder_->cudnn_handle(); +} + +void CUDADeviceContext::RunCudnnFuncWithWorkspace( + const std::function& cudnn_func, size_t workspace_len) const { + cudnn_holder_->RunFunc(cudnn_func, workspace_len); +} cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 0fb53383685221a3415a396ff6b712ccddd011c3..c3b092b2a527c745b5a1c0a4469cc1f66087c12e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -24,7 +24,7 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_MKLDNN -#include +#include "mkldnn.hpp" #endif #include @@ -74,6 +74,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; +class CudnnHolder; class CUDADeviceContext : public DeviceContext { public: @@ -101,6 +102,11 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; + /*! \brief Run a cudnn function with the workspace provided by + * CUDADeviceContext */ + void RunCudnnFuncWithWorkspace(const std::function& cudnn_func, + size_t workspace_len) const; + /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; @@ -127,8 +133,8 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; + std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; int compute_capability; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d9e2afadaf8ec439d158e57c94d3e6e684bce116..dc1d751141187edb7738e42c41514614d4d399b0 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -30,9 +30,6 @@ limitations under the License. */ namespace paddle { namespace platform { namespace { -// Current thread's id. Note, we don't distinguish nested threads -// for now. -thread_local int cur_thread_id = 0; // Tracking the nested block stacks of each thread. thread_local std::deque block_id_stack; // Tracking the nested event stacks. 
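With the handle moved into CudnnHolder, an operator no longer allocates cuDNN workspace itself: it passes the cuDNN call to RunCudnnFuncWithWorkspace as a callable plus the byte count it needs, and the holder grows one shared, mutex-guarded buffer, synchronizing the stream before freeing the old one. A stand-in sketch of that pattern on the host, using plain malloc/free instead of GPU allocations, just to show the contract the callback sees:

#include <cstdio>
#include <cstdlib>
#include <functional>
#include <mutex>

// Minimal analogue of CudnnHolder::RunFunc: one lazily grown workspace,
// handed to callers through a callback under a lock.
class WorkspaceHolder {
 public:
  ~WorkspaceHolder() { std::free(ws_); }

  void RunFunc(const std::function<void(void*)>& func, size_t required) {
    std::lock_guard<std::mutex> guard(mu_);
    if (required > len_) {
      std::free(ws_);              // the real holder syncs the stream first
      ws_ = std::malloc(required);
      len_ = required;
    }
    func(ws_);
  }

 private:
  void* ws_ = nullptr;
  size_t len_ = 0;
  std::mutex mu_;
};

int main() {
  WorkspaceHolder holder;
  holder.RunFunc([](void* ws) { std::printf("workspace at %p\n", ws); }, 1024);
  holder.RunFunc([](void* ws) { std::printf("grown buffer %p\n", ws); }, 4096);
  return 0;
}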
@@ -192,6 +189,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } } // namespace +#endif // PADDLE_WITH_CUPTI + class DeviceTracerImpl : public DeviceTracer { public: DeviceTracerImpl() : enabled_(false) {} @@ -247,6 +246,8 @@ class DeviceTracerImpl : public DeviceTracer { if (enabled_) { return; } + +#ifdef PADDLE_WITH_CUPTI EnableActivity(); // Register callbacks for buffer requests and completed by CUPTI. @@ -265,6 +266,7 @@ class DeviceTracerImpl : public DeviceTracer { dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); +#endif // PADDLE_WITH_CUPTI enabled_ = true; } @@ -316,16 +318,21 @@ class DeviceTracerImpl : public DeviceTracer { } void Disable() { +#ifdef PADDLE_WITH_CUPTI // flush might cause additional calls to DeviceTracker. dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); +#endif // PADDLE_WITH_CUPTI std::lock_guard l(trace_mu_); +#ifdef PADDLE_WITH_CUPTI DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); +#endif // PADDLE_WITH_CUPTI enabled_ = false; } private: +#ifdef PADDLE_WITH_CUPTI static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbdata) { auto *cbInfo = reinterpret_cast(cbdata); @@ -343,7 +350,8 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; } } - + CUpti_SubscriberHandle subscriber_; +#endif // PADDLE_WITH_CUPTI std::mutex trace_mu_; bool enabled_; uint64_t start_ns_; @@ -352,45 +360,9 @@ class DeviceTracerImpl : public DeviceTracer { std::vector mem_records_; std::vector cpu_records_; std::unordered_map correlations_; - CUpti_SubscriberHandle subscriber_; -}; - -#endif // PADDLE_WITH_CUPTI - -class DeviceTracerDummy : public DeviceTracer { - public: - DeviceTracerDummy() {} - - void AddAnnotation(uint64_t id, const std::string &anno) {} - - void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) {} - - void AddMemRecords(const std::string &name, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t stream_id, - uint32_t correlation_id, uint64_t bytes) {} - - void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) {} - - bool IsEnabled() { return false; } - - void Enable() {} - - proto::Profile GenProfile(const std::string &profile_path) { - return proto::Profile(); - } - - void Disable() {} }; -void CreateTracer(DeviceTracer **t) { -#ifdef PADDLE_WITH_CUPTI - *t = new DeviceTracerImpl(); -#else - *t = new DeviceTracerDummy(); -#endif // PADDLE_WITH_CUPTI -} +void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } DeviceTracer *GetDeviceTracer() { std::call_once(tracer_once_flag, CreateTracer, &tracer); @@ -413,12 +385,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } void ClearCurBlock() { block_id_stack.pop_back(); } int BlockDepth() { return block_id_stack.size(); } - -void SetCurThread(int thread_id) { cur_thread_id = thread_id; } - -void ClearCurThread() { cur_thread_id = 0; } - -int CurThread() { return cur_thread_id; } - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 
0375c7439c29d4122e8ff6b58734dad4f504b7a2..f59fc40b71699a790978e22fd7e26da8d4d94c5f 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,6 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#if !defined(_WIN32) +#include +#else +#include +#endif // !_WIN32 + +#include +#include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" @@ -24,6 +32,15 @@ namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. ////////////////////// +#if !defined(_WIN32) +inline uint64_t PosixInNsec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +} +#else +inline uint64_t PosixInNsec() { return static_cast(0); } +#endif // !_WIN32 // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. @@ -99,9 +116,5 @@ std::string CurAnnotation(); void SetCurBlock(int block_id); void ClearCurBlock(); int BlockDepth(); - -void SetCurThread(int thread_id); -void ClearCurThread(); -int CurThread(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 9da787a4073fa002f75154f7c4fba54e9ed8efa6..5939c500c946c44579d1de645ac9700c7701a4e9 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) # There is no macOS version of NCCL. -if (NOT APPLE) +if (NOT APPLE AND NOT WIN32) list(APPEND CUDA_SRCS nccl.cc) endif() @@ -16,7 +16,9 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) +if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3..c7c533bd42859c374c4783d43ec4cdd34a6a994a 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -17,10 +17,10 @@ #include #include #include -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 77e46fa768b62c277d7b4027de7173e39a5672b4..0103e7a3accf88f3c83f109298010c3c9af3d549 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include -#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index e8f4a82ef132be9e4ec3fb76f11766046a2ff638..b946f46e82af4b09fafff54765b899254a4ec1df 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -17,10 +17,10 @@ limitations under the License. 
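PosixInNsec above scales gettimeofday's microsecond wall clock to nanoseconds on POSIX and simply returns 0 on Windows. For reference, a portable std::chrono equivalent would give real timestamps on both platforms; this is only an illustration, not what the header currently does:

#include <chrono>
#include <cstdint>

// Nanoseconds since the system clock epoch, portable to Windows.
inline uint64_t NowInNsec() {
  using namespace std::chrono;
  return static_cast<uint64_t>(
      duration_cast<nanoseconds>(system_clock::now().time_since_epoch())
          .count());
}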
*/ #include #include -#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 5b9e0820e0b319fe7a636a57a0029caf038b4db3..2daf1b4215ce1f7f771bbac72bfe103b0b941976 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once #include -#include #include // NOLINT +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 93bf7c13516ffa4baca6a30f1daf946939726d85..6a3ad2151081504fda2a3818c5f99ad47039d91d 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include - #include #include // NOLINT #include @@ -23,6 +21,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/port.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " @@ -122,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, if (nullptr == dso_handle) { LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" << dlerror() << ")"; + if (dlPath.find("nccl") != std::string::npos) { + std::cout + << "You may need to install 'nccl2' from NVIDIA official website: " + << "https://developer.nvidia.com/nccl/nccl-download" + << "before install PaddlePaddle" << std::endl; + } dlPath = dso_name; dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); } diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 17acefe8cde01809572e4c86cbdccfed9a477a51..aa20553ceffceded09447693c6e92f55fb48702d 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -14,10 +14,10 @@ limitations under the License. 
*/ #pragma once -#include #include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { @@ -49,17 +49,33 @@ extern void* mklml_dso_handle; #define MKLML_ROUTINE_EACH(__macro) \ __macro(cblas_sgemm); \ - __macro(cblas_saxpy); \ - __macro(cblas_scopy); \ - __macro(cblas_sgemv); \ - __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm); \ + __macro(cblas_saxpy); \ __macro(cblas_daxpy); \ + __macro(cblas_scopy); \ __macro(cblas_dcopy); \ + __macro(cblas_sgemv); \ __macro(cblas_dgemv); \ + __macro(cblas_sgemm_alloc); \ + __macro(cblas_dgemm_alloc); \ + __macro(cblas_sgemm_pack); \ + __macro(cblas_dgemm_pack); \ + __macro(cblas_sgemm_compute); \ + __macro(cblas_dgemm_compute); \ + __macro(cblas_sgemm_free); \ + __macro(cblas_dgemm_free); \ + __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm_batch); \ + __macro(cblas_sdot); \ + __macro(cblas_ddot); \ + __macro(cblas_sscal); \ + __macro(cblas_dscal); \ __macro(vsAdd); \ __macro(vdAdd); \ + __macro(vsMul); \ + __macro(vdMul); \ + __macro(vsExp); \ + __macro(vdExp); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index 575516f81870fc9f7b92919ffc20a201cb5cbce8..331ca9908e126d5dbca830457281fbf88fc1df09 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -13,12 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include // NOLINT - #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index d157c1fda789b98f06ad069d2a9c4f421ff82dcd..18ed9956f1841874b27c2493e2f3e22fdfbf0448 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -14,10 +14,9 @@ limitations under the License. */ #pragma once -#include #include // NOLINT - #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" #include "warpctc/include/ctc.h" namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 566485cd3c383640047d97f40b452735e8c8c171..61a653d9313daff96d39c08e80f17d7e33acceb1 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,13 +14,15 @@ limitations under the License. */ #pragma once -#include // for dladdr -#include // for backtrace - #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ +#if defined(_WIN32) +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#endif + #ifdef PADDLE_WITH_CUDA #include #include @@ -37,6 +39,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" @@ -44,7 +47,7 @@ limitations under the License. 
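MKLML_ROUTINE_EACH is an X-macro: the list of cblas/vml routine names is written once and re-expanded with whatever per-name macro the caller supplies, here DECLARE_DYNAMIC_LOAD_MKLML_WRAP, which emits a dlsym-backed wrapper per symbol. A toy sketch of the expansion mechanics with made-up names:

#include <cstdio>

// The routine list is written once...
#define ROUTINE_EACH(__macro) \
  __macro(alpha);             \
  __macro(beta);              \
  __macro(gamma)

// ...and turned into different code by supplying different per-name macros.
#define DECLARE_STUB(__name) \
  void __name() { std::printf(#__name " called\n"); }

ROUTINE_EACH(DECLARE_STUB);

int main() {
  alpha();
  beta();
  gamma();
  return 0;
}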
*/ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" -#ifndef __APPLE__ +#if !defined(__APPLE__) and !defined(_WIN32) #include "paddle/fluid/platform/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -75,7 +78,7 @@ struct EnforceNotMet : public std::exception { sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; sout << "PaddlePaddle Call Stacks: " << std::endl; - +#if !defined(_WIN32) void* call_stack[TRACE_STACK_LIMIT]; auto size = backtrace(call_stack, TRACE_STACK_LIMIT); auto symbols = backtrace_symbols(call_stack, size); @@ -95,6 +98,9 @@ struct EnforceNotMet : public std::exception { } } free(symbols); +#else + sout << "Windows not support stack backtrace yet."; +#endif err_str_ = sout.str(); } } @@ -116,7 +122,12 @@ struct EOFException : public std::exception { // always forces branch prediction of true. // This generates faster binary code. __builtin_expect is since C++11. // For more details, please check https://stackoverflow.com/a/43870188/724872. +#if !defined(_WIN32) #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. +#define UNLIKELY(condition) (condition == 0) +#endif template inline typename std::enable_if::type throw_on_error( @@ -205,7 +216,7 @@ inline typename std::enable_if::type throw_on_error( #endif } -#ifndef __APPLE__ +#if !defined(__APPLE__) and !defined(_WIN32) template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -221,7 +232,7 @@ inline typename std::enable_if::type throw_on_error( #endif } } -#endif // __APPLE__ +#endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA template @@ -229,6 +240,7 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } +#if !defined(_WIN32) #define PADDLE_THROW(...) \ do { \ throw ::paddle::platform::EnforceNotMet( \ @@ -247,15 +259,28 @@ inline void throw_on_error(T e) { __FILE__, __LINE__); \ } \ } while (false) -#else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); -#endif #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ __LINE__); \ } while (false) + +#else +#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__) +#endif // REPLACE_ENFORCE_GLOG + +#else // !_WIN32 +// disable enforce, caused by the varardic macro exception error +#define PADDLE_THROW(x) \ + do { \ + throw std::make_exception_ptr( \ + std::runtime_error("Windows disable the enforce.")); \ + } while (false) + +#define PADDLE_ENFORCE(x, ...) x +#endif // !_WIN32 + /* * Some enforce helpers here, usage: * int a = 1; @@ -263,7 +288,8 @@ inline void throw_on_error(T e) { * PADDLE_ENFORCE_EQ(a, b); * * will raise an expression described as follows: - * "enforce a == b failed, 1 != 2" with detailed stack information. + * "Enforce failed. Expected input a == b, but received a(1) != b(2)." + * with detailed stack information. * * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) @@ -292,9 +318,10 @@ inline void throw_on_error(T e) { #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) 
\ do { \ if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ - PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ - " %s\n%s", \ - #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ + " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \ + #__VAL0, #__VAL1, #__VAL0, \ + paddle::string::to_string(__VAL0), #__VAL1, \ paddle::string::to_string(__VAL1), \ paddle::string::Sprintf("" __VA_ARGS__)); \ } \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 0e8684581a93f076b1a077cc52e966d3c88cf078..d52182965552e9ec945cb7d0b421d8addcb758e9 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -54,7 +54,9 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { PADDLE_ENFORCE_EQ(a, 1 + 3); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4"); + HasPrefix( + StringPiece(error.what()), + "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + 3:4."); } EXPECT_TRUE(caught_exception); } @@ -67,7 +69,8 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; HasPrefix(StringPiece(error.what()), - "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"); + "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + " + "3:4.\ntheir size not match"); } EXPECT_TRUE(caught_exception); } @@ -84,8 +87,9 @@ TEST(ENFORCE_NE, FAIL) { PADDLE_ENFORCE_NE(1.0, 1UL); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE(HasPrefix(StringPiece(error.what()), - "enforce 1.0 != 1UL failed, 1 == 1")) + EXPECT_TRUE(HasPrefix( + StringPiece(error.what()), + "Enforce failed. Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1.")) << error.what() << " does not have expected prefix"; } EXPECT_TRUE(caught_exception); @@ -98,8 +102,9 @@ TEST(ENFORCE_GT, FAIL) { PADDLE_ENFORCE_GT(1, 2UL); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE( - HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + EXPECT_TRUE(HasPrefix( + StringPiece(error.what()), + "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2.")); } EXPECT_TRUE(caught_exception); } @@ -116,8 +121,9 @@ TEST(ENFORCE_GE, FAIL) { PADDLE_ENFORCE_GE(1, 2UL); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE( - HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2")); + EXPECT_TRUE(HasPrefix( + StringPiece(error.what()), + "Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2.")); } EXPECT_TRUE(caught_exception); } @@ -135,8 +141,9 @@ TEST(ENFORCE_LE, FAIL) { PADDLE_ENFORCE_GT(1, 2UL); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE( - HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + EXPECT_TRUE(HasPrefix( + StringPiece(error.what()), + "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2.")); } EXPECT_TRUE(caught_exception); } @@ -153,7 +160,8 @@ TEST(ENFORCE_LT, FAIL) { } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; EXPECT_TRUE(HasPrefix(StringPiece(error.what()), - "enforce 1UL < 0.12 failed, 1 >= 0.12")); + "Enforce failed. 
Expected 1UL < 0.12, but " + "received 1UL:1 >= 0.12:0.12.")); } EXPECT_TRUE(caught_exception); } diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index ffd183af68514dbb1a8b3de39000c9ca3f56ddc3..ee16fc66e4aa7a14c7797487dba0ad5c1e9abe25 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -56,7 +56,11 @@ limitations under the License. */ #include #endif // PADDLE_ARM +#if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) /*do nothing*/ +#endif namespace paddle { namespace platform { @@ -67,8 +71,11 @@ struct float16; } // namespace platform } // namespace paddle +// NOTE(): +// Do not move the eigen.h header, otherwise the eigen_vector will failed. #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace platform { @@ -898,6 +905,30 @@ struct is_pod { is_standard_layout::value; }; +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::float16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::float16& a) { + return paddle::platform::isinf(a); +} + template <> struct numeric_limits { static const bool is_specialized = true; diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc..27e930e6e0a76982b3f27619f38a4a08d82cafa1 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -141,10 +141,36 @@ TEST(float16, lod_tensor_cpu) { } } +TEST(float16, floating) { + // compile time assert. + PADDLE_ASSERT(std::is_floating_point::value); +} + TEST(float16, print) { float16 a = float16(1.0f); std::cout << a << std::endl; } +// CPU test +TEST(float16, isinf) { + float16 a; + a.x = 0x7c00; + float16 b = float16(INFINITY); + float16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(float16, isnan) { + float16 a; + a.x = 0x7fff; + float16 b = float16(NAN); + float16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc..e2b7ca9b03809113c31af8ff4d3ad3713748f330 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,11 +11,13 @@ limitations under the License. 
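Specializing std::is_floating_point/is_signed/is_unsigned and overloading isnan/isinf for float16 lets generic numeric code treat it like the built-in floating types. A small host-side sketch of the kind of template that starts compiling once those traits exist; FiniteSum is an illustrative helper, not a Paddle function:

#include <cmath>
#include <type_traits>
#include <vector>

// Accumulate values, skipping NaN/Inf entries. Requires the element type to
// advertise itself as floating point and to support std::isnan/std::isinf,
// which is exactly what the float16 specializations above provide.
template <typename T>
T FiniteSum(const std::vector<T>& xs) {
  static_assert(std::is_floating_point<T>::value,
                "FiniteSum expects a floating-point element type");
  T sum(0);
  for (const T& x : xs) {
    if (std::isnan(x) || std::isinf(x)) continue;
    sum = sum + x;
  }
  return sum;
}

int main() {
  std::vector<float> v = {1.0f, NAN, 2.0f, INFINITY};
  return FiniteSum(v) == 3.0f ? 0 : 1;  // float16 would work the same way
}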
*/ #include "paddle/fluid/platform/float16.h" +#include #include +#include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/legacy/utils/Logging.h" #define ARITHMETIC_KERNEL(op_type, sign) \ __global__ void op_type(const half* in1, const half* in2, half* out) { \ @@ -241,6 +243,72 @@ TEST(float16, lod_tensor_on_gpu) { } } +template +struct Functor { + bool operator()(const T& val) { + return std::type_index(typeid(T)) == + std::type_index(typeid(platform::float16)); + } +}; + +TEST(float16, typeid) { + // the framework heavily used typeid hash + Functor functor; + float16 a = float16(.0f); + Functor functor2; + int b(0); + + // compile time assert + PADDLE_ASSERT(functor(a) == true); + PADDLE_ASSERT(functor2(b) == false); +} + +// GPU test +TEST(float16, isinf) { + float16 a; + a.x = 0x7c00; + float16 b = float16(INFINITY); + // underflow to 0 + float16 native_a(5e-40f); + // overflow to inf + float16 native_b(5e40f); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(native_b), true); + EXPECT_EQ(native_a, float16(0)); +} + +TEST(float16, isnan) { + float16 a; + a.x = 0x7fff; + float16 b = float16(NAN); + float16 c = float16(5e40); + // inf * +-0 will get a nan + float16 d = c * float16(0); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(d), true); +} + +TEST(float16, cast) { + float16 a; + a.x = 0x0070; + auto b = a; + { + // change semantic, keep the same value + float16 c = reinterpret_cast(reinterpret_cast(b)); + EXPECT_EQ(b, c); + } + + { + // use uint32 low 16 bit store float16 + uint32_t c = reinterpret_cast(b); + float16 d; + d.x = c; + EXPECT_EQ(b, d); + } +} + } // namespace platform } // namespace paddle #endif // PADDLE_CUDA_FP16 diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 4cee93f3a4224cb97327254cd1679021d197a1b1..126636d879213b1c8f242db8fbdf6a358a1d2da9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -116,7 +116,8 @@ size_t GpuMaxChunkSize() { size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * (total - reserving)); - PADDLE_ENFORCE_LE(allocating, available); + PADDLE_ENFORCE_LE(allocating, available, + "Insufficient GPU memory to allocation."); return allocating; } diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0b776528414735e8a7c1e3763e7ccb662bb9f285..4c99f4be321160caf0ee2f89a655bdfb933408e3 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,11 +18,15 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); + namespace paddle { namespace framework { @@ -81,9 +85,6 @@ void InitDevices(bool init_p2p) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif InitDevices(init_p2p, devices); } @@ -97,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif for (size_t i = 0; i < devices.size(); ++i) { @@ -115,8 +113,24 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN - platform::SetNumThreads(1); + platform::SetNumThreads(FLAGS_paddle_num_threads); +#endif + + if (platform::jit::MayIUse(platform::jit::avx512_common)) { +#ifndef __AVX512F__ + LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; +#endif + } + if (platform::jit::MayIUse(platform::jit::avx2)) { +#ifndef __AVX2__ + LOG(WARNING) << "AVX2 is available, Please re-compile on local machine"; +#endif + } + if (platform::jit::MayIUse(platform::jit::avx)) { +#ifndef __AVX__ + LOG(WARNING) << "AVX is available, Please re-compile on local machine"; #endif + } } void InitGLOG(const std::string &prog_name) { diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 4cc04b090519637ab0b8d3740b8a12f216218cae..32b7efc04c1f2ecc22f93c08387aec69ded4930a 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include // Disable the copy and assignment operator for a class. #ifndef DISABLE_COPY_AND_ASSIGN @@ -23,3 +24,7 @@ limitations under the License. 
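The InitDevices additions warn when the running CPU supports AVX/AVX2/AVX-512 but the binary was compiled without the matching instruction set, so users know a local rebuild would be faster. Outside of Paddle's platform::jit::MayIUse, GCC and Clang on x86 expose the same runtime probe through __builtin_cpu_supports; a minimal sketch of the compile-time versus runtime comparison:

#include <cstdio>

int main() {
#if defined(__GNUC__) && !defined(__AVX2__)
  // Built without -mavx2: warn if the CPU could actually use it.
  if (__builtin_cpu_supports("avx2")) {
    std::fprintf(stderr,
                 "CPU supports AVX2 but this binary was built without it; "
                 "consider recompiling with -mavx2\n");
  }
#endif
  return 0;
}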
*/ classname& operator=(const classname&) = delete; \ classname& operator=(classname&&) = delete #endif + +#if defined(__FLT_MAX__) +#define FLT_MAX __FLT_MAX__ +#endif // __FLT_MAX__ diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 33fec2c1073819d88d85a8872227adcb9df3e8f4..c0a2543ba5d8ff8f34cb6231c51cb5053a6a9481 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -125,6 +125,11 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); } + std::shared_ptr AcquireBiasMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); + } + std::shared_ptr AcquireDstMemory( const mkldnn::memory::desc& md, void* ptr) { return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); @@ -187,7 +192,8 @@ class MKLDNNHandler { mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, const std::string& suffix, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -208,7 +214,7 @@ class MKLDNNHandler { pipeline.push_back(*reorder_p); } dev_ctx_.SetBlob(local_key, target_memory_p); - } else { + } else if (!is_persistent) { // Make reorder if needed auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); @@ -222,17 +228,18 @@ class MKLDNNHandler { static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT const std::string& suffix) { - auto dims2str = [](const mkldnn::memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dims2str(operand_dims) + suffix; } + protected: + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + } + protected: const MKLDNNDeviceContext& dev_ctx_; mkldnn::engine engine_; @@ -250,5 +257,17 @@ inline mkldnn::memory::format MKLDNNFormatForSize( return data_format; } +inline mkldnn::memory::format data_format_to_memory_format( + const std::string& data_format) { + switch (framework::StringToDataLayout(data_format)) { + case framework::DataLayout::kNHWC: + return mkldnn::memory::format::nhwc; + case framework::DataLayout::kNCHW: + return mkldnn::memory::format::nchw; + default: + return mkldnn::memory::format::any; + } +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cc46c88fd1f9a5d1bacad26beed6fd0af6405310..115abb98d56e633c938695c8127c832eab602110 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -100,14 +100,13 @@ struct NCCLContextMap { return; } std::unique_ptr comms(new ncclComm_t[order_.size()]); - // if pass nccl_id here, can assume we are doing multi node training - if (nccl_id == nullptr) { + // if num_trainers == 1, should create a new nccl id for local comms. 
+ if (num_trainers == 1) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); } else { - PADDLE_ENFORCE_GT(num_trainers, 1); - // TODO(wuyi): need to ensure each node have same number of GPUs + PADDLE_ENFORCE_NOT_NULL(nccl_id); { int nranks = num_trainers * order_.size(); NCCLGroupGuard gurad; diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h new file mode 100644 index 0000000000000000000000000000000000000000..cf9f4aa95bc1cb79d95b79331fbc09e11af64194 --- /dev/null +++ b/paddle/fluid/platform/port.h @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" + +#if !defined(_WIN32) +#define UNUSED __attribute__((unused)) +#include // dladdr +#include // backtrace +#include +#include // std::accumulate +#else +#include // _popen, _pclose +#include +#if defined(_WIN32) +#include // std::accumulate in msvc +#endif +// windows version of __attribute__((unused)) +#define UNUSED __pragma(warning(suppress : 4100)) + +#ifndef S_ISDIR // windows port for sys/stat.h +#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) +#endif // S_ISDIR + +static void *dlsym(void *handle, const char *symbol_name) { + FARPROC found_symbol; + found_symbol = GetProcAddress((HMODULE)handle, symbol_name); + + if (found_symbol == NULL) { + throw std::runtime_error(std::string(symbol_name) + " not found."); + } + return reinterpret_cast(found_symbol); +} + +static void *dlopen(const char *filename, int flag) { + std::string file_name(filename); + file_name.replace(0, file_name.size() - 1, '/', '\\'); + HMODULE hModule = LoadLibrary(file_name.c_str()); + if (!hModule) { + throw std::runtime_error(file_name + " not found."); + } + return reinterpret_cast(hModule); +} + +#endif // !_WIN32 + +static void ExecShellCommand(const std::string &cmd, std::string *message) { + char buffer[128]; +#if !defined(_WIN32) + std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); +#else + std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); +#endif // _WIN32 + if (!pipe) { + LOG(ERROR) << "error running command: " << cmd; + return; + } + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != nullptr) { + *message += buffer; + } + } +} + +static bool PathExists(const std::string &path) { +#if !defined(_WIN32) + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } +#else + struct _stat statbuf; + if (_stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } +#endif // !_WIN32 + return false; +} + +// TODO(yuyang18): If the functions below are needed by other files, move them +// to paddle::filesystem namespace. 
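port.h backfills dlopen/dlsym on Windows with LoadLibrary/GetProcAddress so the dynload wrappers keep a single POSIX-style call site. A sketch of caller code that stays identical on both platforms once such a shim is in scope; the library path and the "add" symbol are placeholders:

#include <stdexcept>
#include <string>
#if !defined(_WIN32)
#include <dlfcn.h>  // POSIX; on Windows, port.h provides dlopen/dlsym instead
#else
#define RTLD_NOW 0  // the flag argument is ignored by the Windows shim
#endif

typedef int (*AddFn)(int, int);

// Resolve an "add" symbol from a shared library with one code path.
AddFn LoadAdd(const std::string& lib_path) {
  void* handle = dlopen(lib_path.c_str(), RTLD_NOW);
  if (handle == nullptr) throw std::runtime_error("cannot load " + lib_path);
  void* sym = dlsym(handle, "add");
  if (sym == nullptr) throw std::runtime_error("symbol 'add' not found");
  return reinterpret_cast<AddFn>(sym);
}

Note that the Windows shim throws on failure instead of returning nullptr, so the nullptr checks there are redundant but harmless.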
+#if !defined(_WIN32) +constexpr char kSEP = '/'; +#else +constexpr char kSEP = '\\'; +#endif // _WIN32 + +static bool FileExists(const std::string &filepath) { +#if !defined(_WIN32) + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +#else + struct _stat buffer; + return (_stat(filepath.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + std::string path_error(path); + path_error += " mkdir failed!"; +#if !defined(_WIN32) + if (mkdir(path, 0755)) { + if (errno != EEXIST) { + throw std::runtime_error(path_error); + } + } +#else + CreateDirectory(path, NULL); + auto errorno = GetLastError(); + if (errorno != ERROR_ALREADY_EXISTS) { + throw std::runtime_error(path_error); + } +#endif // !_WIN32 +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 01de9d7041bf3eb40884e2a6295027cccfaebd2a..652a6ec7a4e2e823b28f39b449570cd375e88e18 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include -#include #include #include #include @@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() { .count(); } -inline uint64_t PosixInNsec() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); -} - Event::Event(EventType type, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) { @@ -110,6 +103,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id, has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false; if (has_cuda_) { auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(cudaSetDevice( + boost::get(cuda_dev_ctx->GetPlace()).device)); PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaEventCreate(&event_)); auto stream = cuda_dev_ctx->stream(); @@ -176,6 +171,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; dev_ctx_ = dev_ctx; @@ -186,11 +182,12 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), - BlockDepth(), CurThread()); + BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(name_, dev_ctx_); @@ -198,6 +195,7 @@ RecordEvent::~RecordEvent() { RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; SetCurBlock(block_id); @@ -205,27 +203,18 @@ RecordBlock::RecordBlock(int block_id) } RecordBlock::~RecordBlock() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - CurThread()); + g_thread_id); } ClearCurBlock(); } -RecordThread::RecordThread(int thread_id) { - if (g_state == ProfilerState::kDisabled) return; - SetCurThread(thread_id); -} - -RecordThread::~RecordThread() { - if (g_state == ProfilerState::kDisabled) return; - ClearCurThread(); -} - void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", @@ -281,12 +270,13 @@ struct EventItem { double min_time; double max_time; double ave_time; + float ratio; }; // Print results void PrintProfiler(const std::vector>& events_table, const std::string& sorted_domain, const size_t name_width, - const size_t data_width) { + const size_t data_width, double total) { // Output header information std::cout << "\n------------------------->" << " Profiling Report " @@ -311,7 +301,8 @@ void PrintProfiler(const std::vector>& events_table, std::cout << std::setw(name_width) << "Event" << std::setw(data_width) << "Calls" << std::setw(data_width) << "Total" << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." << std::endl; + << "Max." << std::setw(data_width) << "Ave." + << std::setw(data_width) << "Ratio." 
<< std::endl; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { const EventItem& event_item = events_table[i][j]; @@ -320,7 +311,9 @@ void PrintProfiler(const std::vector>& events_table, << std::setw(data_width) << event_item.total_time << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time - << std::setw(data_width) << event_item.ave_time << std::endl; + << std::setw(data_width) << event_item.ave_time + << std::setw(data_width) << event_item.total_time / total + << std::endl; } } std::cout << std::endl; @@ -370,6 +363,7 @@ void ParseEvents(const std::vector>& events, std::vector> events_table; size_t max_name_width = 0; + double total = 0.; // the total time for (size_t i = 0; i < events.size(); i++) { std::list pushed_events; std::vector event_items; @@ -390,6 +384,7 @@ void ParseEvents(const std::vector>& events, g_state == ProfilerState::kAll) ? rit->CudaElapsedMs(events[i][j]) : rit->CpuElapsedMs(events[i][j]); + total += event_time; std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); @@ -398,7 +393,8 @@ void ParseEvents(const std::vector>& events, if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time}; + event_time, event_time, event_time, + 0.}; event_items.push_back(event_item); } else { int index = event_idx[event_name]; @@ -442,7 +438,7 @@ void ParseEvents(const std::vector>& events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total); } void DisableProfiler(EventSortingKey sorted_key, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index bf43925373a12cd9ff2155d68c42d0266ba4df60..38630686f7cf3c669373f941d989adf11ba6cfe6 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); +#if !defined(_WIN32) struct RecordEvent { RecordEvent(const std::string& name, const DeviceContext* dev_ctx); @@ -94,11 +95,15 @@ struct RecordBlock { std::string name_; uint64_t start_ns_; }; - -struct RecordThread { - explicit RecordThread(int thread_id); - ~RecordThread(); +#else +// windows do not support profiler temporarily. +struct RecordEvent { + RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} }; +struct RecordBlock { + explicit RecordBlock(int block_id) {} +}; +#endif // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 45f60fc9d76560b133fa06198a24c7eaccc24088..dc9fad29f281a1c6ac300b48f9e600ff802a5752 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -38,6 +38,7 @@ limitations under the License. 
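The new Ratio column is each event's accumulated time divided by the running total over every parsed event, so rows read directly as fractions of the profiled time. A standalone sketch of that computation and the setw-style layout PrintProfiler uses, with fabricated numbers:

#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

struct Item {
  std::string name;
  double total_ms;
};

int main() {
  std::vector<Item> items = {{"conv2d", 120.0}, {"relu", 30.0}, {"fc", 50.0}};
  double total = 0.0;
  for (const auto& it : items) total += it.total_ms;

  std::cout << std::setw(12) << "Event" << std::setw(12) << "Total"
            << std::setw(12) << "Ratio." << std::endl;
  for (const auto& it : items) {
    std::cout << std::setw(12) << it.name << std::setw(12) << it.total_ms
              << std::setw(12) << it.total_ms / total << std::endl;
  }
  return 0;
}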
*/ #endif #endif +#include #include #include #include diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 89ca4f781273e99bbb83216c238dfc5c88c0a22b..b5bd07d401f9ebfe441bc0f84f9bad317f0e8da9 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,19 +1,24 @@ + +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) +if(NOT WIN32) +list(APPEND PYBIND_DEPS parallel_executor profiler) +list(APPEND PYBIND_SRCS recordio.cc) +endif() if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method - parallel_executor + SRCS ${PYBIND_SRCS} + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method - parallel_executor + SRCS ${PYBIND_SRCS} + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB}) - if(NOT APPLE AND NOT ANDROID) + if(NOT APPLE AND NOT ANDROID AND NOT WIN32) target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID) + endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 76aa7d2010682416f68e982e9b89da9813abb078..f577068d1f39a3083a54f106d006f9982304411e 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" -#include +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -24,6 +25,8 @@ void BindConstValue(pybind11::module* m) { m->def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); + m->def("kControlDepVarName", + [] { return framework::ir::Node::kControlDepVarName; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); @@ -40,6 +43,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpRoleVarAttrName", framework::OpProtoAndCheckerMaker::OpRoleVarAttrName); + op_proto_and_checker_maker.def( + "kOpNameScopeAttrName", + framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index fcd3356d44ee592233c3883d439d0677714900b8..67501186d150171728194f23bc02d2c014848dd7 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) { PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. 
This could " "be a bug of Paddle."); - }); + }) + .def("_version", [](pd::ProgramDesc &self) -> int64_t { + return self.Proto()->version().version(); + }); } void BindBlockDesc(pybind11::module *m) { @@ -145,14 +148,14 @@ void BindBlockDesc(pybind11::module *m) { .def_property_readonly("id", &pd::BlockDesc::ID) .def_property_readonly("parent", &pd::BlockDesc::Parent) .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID) - .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID) + .def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID) .def("append_op", &pd::BlockDesc::AppendOp, pybind11::return_value_policy::reference) - .def("prepend_op", &pd::BlockDesc::PrependOp, + .def("_prepend_op", &pd::BlockDesc::PrependOp, pybind11::return_value_policy::reference) - .def("insert_op", &pd::BlockDesc::InsertOp, + .def("_insert_op", &pd::BlockDesc::InsertOp, pybind11::return_value_policy::reference) - .def("remove_op", &pd::BlockDesc::RemoveOp) + .def("_remove_op", &pd::BlockDesc::RemoveOp) .def("var", [](pd::BlockDesc &self, pybind11::bytes byte_name) { std::string name = byte_name; @@ -165,7 +168,7 @@ void BindBlockDesc(pybind11::module *m) { return self.HasVar(name); }, pybind11::return_value_policy::reference) - .def("rename_var", + .def("_rename_var", [](pd::BlockDesc &self, const pybind11::bytes &byte_name, const pybind11::bytes &byte_name_new) { std::string name = byte_name; @@ -189,7 +192,7 @@ void BindBlockDesc(pybind11::module *m) { return self.FindVarRecursive(name); }, pybind11::return_value_policy::reference) - .def("remove_var", + .def("_remove_var", [](pd::BlockDesc &self, pybind11::bytes byte_name) { std::string name = byte_name; return self.RemoveVar(name); @@ -205,12 +208,7 @@ void BindBlockDesc(pybind11::module *m) { void BindVarDsec(pybind11::module *m) { pybind11::class_ var_desc(*m, "VarDesc", ""); var_desc - .def("name", - [](pd::VarDesc &self) { - pybind11::bytes name = self.Name(); - return name; - }, - pybind11::return_value_policy::reference) + .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference) .def("set_name", &pd::VarDesc::SetName) .def("set_shape", &pd::VarDesc::SetShape) .def("set_shapes", &pd::VarDesc::SetShapes) @@ -239,6 +237,7 @@ void BindVarDsec(pybind11::module *m) { pybind11::enum_(var_desc, "VarType", "") .value("BOOL", pd::proto::VarType::BOOL) .value("UINT8", pd::proto::VarType::UINT8) + .value("INT8", pd::proto::VarType::INT8) .value("INT16", pd::proto::VarType::INT16) .value("INT32", pd::proto::VarType::INT32) .value("INT64", pd::proto::VarType::INT64) @@ -301,7 +300,8 @@ void BindOpDesc(pybind11::module *m) { std::string ser(seriralized); self.SetAttr(name, ser); }) - .def("block_attr", &pd::OpDesc::GetBlockAttr) + .def("block_attr_id", &pd::OpDesc::GetBlockAttrId) + .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7a8bb712452538b7e2a349d56a15de3284f82b39..8bc30fc123163983f4bddc19af489920db93e0c0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include +#include #include // NOLINT // for call_once #include #include @@ -32,6 +33,7 @@ limitations under the License. 
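The renamed bindings (_set_forward_block_idx, _prepend_op, _insert_op, _remove_op, _rename_var, _remove_var, _version, ...) follow the Python convention that a leading underscore marks an attribute as internal; only the exposed name changes, the bound C++ member stays the same. A minimal pybind11 sketch of that convention on a toy type (using modern pybind11's PYBIND11_MODULE rather than the PYBIND11_PLUGIN macro seen in this file):

#include <pybind11/pybind11.h>

namespace py = pybind11;

struct Counter {
  int value = 0;
  int Get() const { return value; }
  void Reset() { value = 0; }  // implementation detail
};

PYBIND11_MODULE(counter_demo, m) {
  py::class_<Counter>(m, "Counter")
      .def(py::init<>())
      .def("get", &Counter::Get)        // public API
      .def("_reset", &Counter::Reset);  // underscore marks it internal
}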
*/ #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" @@ -53,6 +55,8 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #endif +#include "pybind11/stl.h" + // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -66,6 +70,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithDIST() { +#ifdef PADDLE_WITH_DISTRIBUTE + return true; +#else + return false; +#endif +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -78,37 +90,37 @@ PYBIND11_PLUGIN(core) { py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) - .def("get_dims", + .def("_get_dims", [](const Tensor &self) { return vectorize(self.dims()); }) - .def("set_dims", + .def("_set_dims", [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) - .def("set_layout", + .def("_set_layout", [](Tensor &self, const std::string &layout) { self.set_layout(StringToDataLayout(layout)); }) - .def("alloc_float", + .def("_alloc_float", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) - .def("alloc_float", + .def("_alloc_float", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) - .def("alloc_int", + .def("_alloc_int", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) - .def("alloc_int", + .def("_alloc_int", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) - .def("alloc_int", + .def("_alloc_int", [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) - .def("alloc_float", + .def("_alloc_float", [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) @@ -119,6 +131,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) #ifdef PADDLE_WITH_CUDA .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) @@ -127,6 +140,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) @@ -134,13 +148,14 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) - .def("set_float_element", TensorSetElement) - .def("get_float_element", TensorGetElement) - .def("set_double_element", TensorSetElement) - .def("get_double_element", TensorGetElement) - .def("dtype", [](Tensor &self) { return ToDataType(self.type()); }); + .def("_set_float_element", TensorSetElement) + .def("_get_float_element", TensorGetElement) + .def("_set_double_element", TensorSetElement) + .def("_get_double_element", TensorGetElement) + .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); 
}); py::class_(m, "LoDTensor") .def_buffer( @@ -238,16 +253,13 @@ PYBIND11_PLUGIN(core) { self.set_rows(new_rows); #endif }) + .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); }) .def("rows", [](SelectedRows &self) { -#ifndef PADDLE_WITH_CUDA - return self.rows(); -#else - auto rows = self.rows(); - std::vector new_rows; - new_rows.reserve(rows.size()); - std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); - return new_rows; -#endif + auto rows = self.rows(); + std::vector new_rows; + new_rows.reserve(rows.size()); + std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); + return new_rows; }); py::class_(m, "Variable", R"DOC(Variable Class. @@ -296,13 +308,14 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") - .def("reset", &framework::ReaderHolder::ReInit); + .def("reset", &framework::ReaderHolder::ResetAll); using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; using LoDTensorBlockingQueueHolder = ::paddle::operators::reader::LoDTensorBlockingQueueHolder; - py::class_(m, "LoDTensorBlockingQueue", "") + py::class_>( + m, "LoDTensorBlockingQueue", "") .def("push", [](LoDTensorBlockingQueue &self, const std::vector &lod_tensor_vec) { @@ -317,7 +330,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity, const std::vector> &shapes) - -> LoDTensorBlockingQueue * { + -> std::shared_ptr { std::vector dims(shapes.size()); std::transform(shapes.begin(), shapes.end(), dims.begin(), [](const std::vector &shape) { @@ -325,9 +338,9 @@ All parameter, weight, gradient are variables in Paddle. }); auto *holder = var.GetMutable(); holder->InitOnce(capacity, dims); - return holder->GetQueue().get(); + return holder->GetQueue(); }, - py::return_value_policy::reference); + py::return_value_policy::copy); py::class_(m, "Scope", "") .def("var", @@ -388,8 +401,10 @@ All parameter, weight, gradient are variables in Paddle. InferenceOptimize(*(origin.Proto()), &pruned_desc); return new ProgramDesc(pruned_desc); }); - m.def("empty_var_name", []() { return framework::kEmptyVarName; }); - m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; }); + m.def("empty_var_name", + []() { return std::string(framework::kEmptyVarName); }); + m.def("grad_var_suffix", + []() { return std::string(framework::kGradVarSuffix); }); m.def_submodule( "var_names", "The module will return special predefined variable name in Paddle") @@ -492,10 +507,7 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) -#ifdef PADDLE_WITH_DISTRIBUTE - .def("begin_pass", &Executor::BeginPass) - .def("end_pass", &Executor::EndPass) -#endif + .def("close", &Executor::Close) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, int block_id, bool create_local_scope, bool create_vars) { pybind11::gil_scoped_release release; @@ -508,6 +520,7 @@ All parameter, weight, gradient are variables in Paddle. [](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_dist", IsCompiledWithDIST); #ifdef PADDLE_WITH_CUDA m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 @@ -518,6 +531,8 @@ All parameter, weight, gradient are variables in Paddle. 
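Note: the `is_compiled_with_dist` binding registered above gives Python code a runtime check for whether the wheel was built with distributed support, alongside the existing CUDA check. A minimal usage sketch follows; it assumes the compiled pybind module is importable at its conventional location, paddle.fluid.core, which this patch does not itself show.

# Usage sketch (assumption: the pybind module is exposed as paddle.fluid.core).
import paddle.fluid as fluid

core = fluid.core
print(core.is_compiled_with_cuda())   # existing feature flag
print(core.is_compiled_with_dist())   # new flag registered above
print(core.kControlDepVarName())      # control-dependency var name bound in const_value.cc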
m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("_is_program_version_supported", IsProgramVersionSupported); + BindProgramDesc(&m); BindBlockDesc(&m); BindVarDsec(&m); @@ -534,6 +549,8 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "LoDTensorArray") + .def("__init__", + [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); }) .def("__getitem__", [](LoDTensorArray &self, size_t i) { return &self.at(i); }, py::return_value_policy::reference) @@ -585,8 +602,8 @@ All parameter, weight, gradient are variables in Paddle. // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); - py::class_(pe, "ExecutionStrategy") - .def(py::init()) + py::class_ exec_strategy(pe, "ExecutionStrategy"); + exec_strategy.def(py::init()) .def_property( "num_threads", [](const ExecutionStrategy &self) { return self.num_threads_; }, @@ -613,6 +630,16 @@ All parameter, weight, gradient are variables in Paddle. [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; }); + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); + py::class_ build_strategy(pe, "BuildStrategy"); py::enum_(build_strategy, "ReduceStrategy") @@ -656,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. 
We can only return Scope* diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc index 330d104e0a774d905e463566f85bd2e64a080190..f83b026d4d50772b969c4316964b70a68b27442b 100644 --- a/paddle/fluid/pybind/recordio.cc +++ b/paddle/fluid/pybind/recordio.cc @@ -30,7 +30,9 @@ class RecordIOWriter { public: RecordIOWriter(const std::string& filename, recordio::Compressor compressor, size_t max_num_record) - : stream_(filename), writer_(&stream_, compressor, max_num_record) {} + : closed_(false), + stream_(filename), + writer_(&stream_, compressor, max_num_record) {} void AppendTensor(const framework::LoDTensor& tensor) { tensors_.push_back(tensor); @@ -47,9 +49,17 @@ class RecordIOWriter { PADDLE_ENFORCE(tensors_.empty()); writer_.Flush(); stream_.close(); + closed_ = true; + } + + ~RecordIOWriter() { + if (!closed_) { + Close(); + } } private: + bool closed_; std::vector tensors_; std::ofstream stream_; recordio::Writer writer_; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 3e2ea1ef88b03f5b2576c1cee2b5d26a439943da..51614a6a3dd2f7f830cf533fc365b56a99d3b918 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -97,7 +97,7 @@ struct CastToPyBufferImpl { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { auto buffer_info = details::CastToPyBufferImpl()(tensor); + uint8_t, int8_t, platform::float16>()(tensor); return buffer_info; } diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc index 06a13e6c5b6ea76456e231e3f7b1eb33492b16ea..a0a2f984228db0e7a015630655a3176aa4d1a5a4 100644 --- a/paddle/fluid/recordio/scanner.cc +++ b/paddle/fluid/recordio/scanner.cc @@ -28,6 +28,7 @@ Scanner::Scanner(std::unique_ptr &&stream) Scanner::Scanner(const std::string &filename) : stream_(new std::ifstream(filename)), parser_(*stream_) { + PADDLE_ENFORCE(static_cast(*stream_), "Cannot open file %s", filename); Reset(); } diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 1fe7f42ca1c692e4d7034883022852657be8cc20..8572dc1e8e543b552e3ed5a180ec942faf90a624 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(stringpiece SRCS piece.cc) +cc_library(pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/fluid/framework/details/ssa_graph.cc b/paddle/fluid/string/pretty_log.cc similarity index 66% rename from paddle/fluid/framework/details/ssa_graph.cc rename to paddle/fluid/string/pretty_log.cc index 1b8c889449059c563ea39f86250075ac2537cdbe..4534fdc58b81fe03b3a1fc19b55aa62ddbf5eaf1 100644 --- a/paddle/fluid/framework/details/ssa_graph.cc +++ b/paddle/fluid/string/pretty_log.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,4 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
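Note: together with the VarType::INT8 enum value and the int8_t entries added to the Tensor `set` bindings and to tensor_py.h, int8 numpy arrays can now round-trip through a LoDTensor. A minimal sketch, again assuming the extension is importable as paddle.fluid.core:

import numpy as np
import paddle.fluid.core as core

t = core.LoDTensor()
t.set(np.random.randint(-128, 128, size=(2, 3)).astype('int8'), core.CPUPlace())
arr = np.array(t)            # reads back through the buffer protocol
print(arr.dtype, arr.shape)  # int8 (2, 3)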
-#include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/string/pretty_log.h" +#include + +DEFINE_bool(color, true, "Whether to turn on pretty log"); + +namespace paddle { +namespace string {} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h new file mode 100644 index 0000000000000000000000000000000000000000..a3b4e38f453835828a4a53130e11c854ac3f4a74 --- /dev/null +++ b/paddle/fluid/string/pretty_log.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/string/printf.h" + +DECLARE_bool(color); + +namespace paddle { + +namespace string { + +inline std::string black() { return FLAGS_color ? "\e[30m" : ""; } +inline std::string red() { return FLAGS_color ? "\e[31m" : ""; } +inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; } +inline std::string green() { return FLAGS_color ? "\e[32m" : ""; } +inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; } +inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; } +inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; } +inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; } +inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string white() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; } +inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; } +inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; } +inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; } +inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; } +inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; } + +using TextBlock = std::pair; + +struct Style { + static std::string info() { return black(); } + static std::string warn() { return b_red(); } + static std::string suc() { return green(); } + static std::string H1() { return bold() + purple(); } + static std::string H2() { return green(); } + static std::string H3() { return green(); } + static std::string detail() { return light_gray(); } +}; + +template +static void PrettyLogEndl(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; +} +template +static void PrettyLog(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) 
<< reset(); +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index 4425f062efa6eab552caee1a429746528cd66926..a0757b53f37b29de0b3802c345b1ad9db69f16e9 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include "paddle/fluid/framework/executor.h" @@ -21,6 +22,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace train { @@ -93,11 +95,21 @@ int main() { auto loss_var = scope.Var(loss_name); + paddle::platform::ProfilerState pf_state; + pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + clock_t t1 = clock(); + for (int i = 0; i < 10; ++i) { executor.Run(*train_program.get(), &scope, 0, false, true); std::cout << "step: " << i << " loss: " << loss_var->Get().data()[0] << std::endl; } + + clock_t t2 = clock(); + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal, + "run_paddle_op_profiler"); + std::cout << "run_time = " << t2 - t1 << std::endl; return 0; } diff --git a/paddle/legacy/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp index 87fac3d6c6abe37b128213d4ffd66f8c1573a910..0ce1770c76c2e145d0b2bf71332cc4593517f195 100644 --- a/paddle/legacy/capi/Arguments.cpp +++ b/paddle/legacy/capi/Arguments.cpp @@ -66,6 +66,17 @@ paddle_error paddle_arguments_get_value(paddle_arguments args, return kPD_NO_ERROR; } +PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args, + uint64_t ID, + paddle_matrix mat) { + if (args == nullptr || mat == nullptr) return kPD_NULLPTR; + auto m = paddle::capi::cast(mat); + auto a = castArg(args); + if (ID >= a->args.size()) return kPD_OUT_OF_RANGE; + m->mat = a->args[ID].in; + return kPD_NO_ERROR; +} + paddle_error paddle_arguments_get_ids(paddle_arguments args, uint64_t ID, paddle_ivector ids) { diff --git a/paddle/legacy/capi/arguments.h b/paddle/legacy/capi/arguments.h index 69a66bb012c318bc8317c246d690a7f4baffd248..ceb64ee6aa74a8ba4b5cb9045b366dcda8f8cc90 100644 --- a/paddle/legacy/capi/arguments.h +++ b/paddle/legacy/capi/arguments.h @@ -87,6 +87,18 @@ PD_API paddle_error paddle_arguments_get_value(paddle_arguments args, uint64_t ID, paddle_matrix mat); +/** + * @brief paddle_arguments_get_prob Get the prob matrix of beam search, which + * slot ID is `ID` + * @param [in] args arguments array + * @param [in] ID array index + * @param [out] mat matrix pointer + * @return paddle_error + */ +PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args, + uint64_t ID, + paddle_matrix mat); + /** * @brief PDArgsGetIds Get the integer vector of one argument in array, which * index is `ID`. 
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh deleted file mode 120000 index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000 --- a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh +++ /dev/null @@ -1 +0,0 @@ -../dense/convert_protobin.sh \ No newline at end of file diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh new file mode 100644 index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3 --- /dev/null +++ b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh @@ -0,0 +1 @@ +../dense/convert_protobin.sh diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh deleted file mode 120000 index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000 --- a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh +++ /dev/null @@ -1 +0,0 @@ -../dense/convert_protobin.sh \ No newline at end of file diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh new file mode 100644 index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3 --- /dev/null +++ b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh @@ -0,0 +1 @@ +../dense/convert_protobin.sh diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh deleted file mode 120000 index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000 --- a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh +++ /dev/null @@ -1 +0,0 @@ -../dense/convert_protobin.sh \ No newline at end of file diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh new file mode 100644 index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3 --- /dev/null +++ b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh @@ -0,0 +1 @@ +../dense/convert_protobin.sh diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp index 7faeff55c28b9065179ad27b3b604a9f411249e5..21ed049c4d2743d1fa914d6948d6c8c2862f0bfc 100644 --- a/paddle/legacy/utils/PythonUtil.cpp +++ b/paddle/legacy/utils/PythonUtil.cpp @@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args); +#if PY_MAJOR_VERSION >= 3 + Py_ssize_t str_size = 0u; + const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size); + return std::string(str, (size_t)str_size); +#else return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); +#endif // PY_MAJOR_VERSION >= 3 } PyObjectPtr createPythonClass( diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h index b0c8612c378fbe12cdf24e51a5b6546740b2d4c8..d5b2dbddde21f5c2a0696aadeda2b057175fc5e9 
100644 --- a/paddle/legacy/utils/PythonUtil.h +++ b/paddle/legacy/utils/PythonUtil.h @@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName, namespace py { PyObjectPtr import(const std::string& moduleName); +#if PY_MAJOR_VERSION >= 3 +/** + * Cast a PyLong to int type T. + * @tparam T return type. + * @param [in] obj PyLong object. + * @param [out] ok status for casting. False if error occured. nullptr if user + * don't care is ok or not. + * @return The value of python object, or 0 if not ok. + */ +template +T castInt(PyObject* obj, bool* ok = nullptr) { + // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object + // were unified to long since python3 + if (PyLong_Check(obj)) { + if (ok) *ok = true; + return (T)PyLong_AsUnsignedLong(obj); + } else { + if (ok) *ok = false; + return (T)0; + } +} + +// Convert PyAPI from 2.x to 3.x +#define PyString_FromString PyUnicode_FromString +#define PyString_AsString PyUnicode_AsUTF8 + +#else /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. @@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) { return (T)0; } } +#endif // PY_MAJOR_VERSION >= 3 /** * Invoke repr of python object. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d173b41e86f61954954b6a5ea9957d2e172deca0..ba5065f468376d83488d7eade5dc2041d86dfd39 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -19,6 +19,8 @@ # Utils #================================================= +set -ex + function print_usage() { echo -e "\n${RED}Usage${NONE}: ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" @@ -31,12 +33,14 @@ function print_usage() { ${BLUE}single_test${NONE}: run a single unit test ${BLUE}bind_test${NONE}: parallel tests bind to different GPU ${BLUE}doc${NONE}: generate paddle documents + ${BLUE}gen_doc_lib${NONE}: generate paddle documents library ${BLUE}html${NONE}: convert C++ source code into HTML ${BLUE}dockerfile${NONE}: generate paddle release dockerfile ${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check ${BLUE}cicheck${NONE}: run CI tasks + ${BLUE}assert_api_not_changed${NONE}: check api compability " } @@ -78,6 +82,12 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + elif [ "$1" == "cp35-cp35m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" fi fi @@ -106,8 +116,9 @@ function cmake_gen() { -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_INFERENCE=${WITH_INFERENCE:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -135,8 +146,9 @@ EOF -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ + 
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DPY_VERSION=${PY_VERSION:-2.7} } function abort(){ @@ -318,13 +330,35 @@ function assert_api_not_changed() { virtualenv .env source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl - curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/PaddlePaddle/FluidAPISpec/master/API.spec} \ - > origin.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec - python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec + if [ "$1" == "cp35-cp35m" ]; then + # Use sed to make python2 and python3 sepc keeps the same + sed -i 's/arg0: str/arg0: unicode/g' new.spec + sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec + fi + python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate } +function assert_api_spec_approvals() { + if [ -z ${BRANCH} ]; then + BRANCH="develop" + fi + + API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true` + echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" + if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then + # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have at least 2 approvals for the api change!" + exit 1 + fi + fi +} + function single_test() { TEST_NAME=$1 @@ -397,6 +431,61 @@ EOF linkchecker doc/v2/en/html/index.html linkchecker doc/v2/cn/html/index.html linkchecker doc/v2/api/en/html/index.html + +# if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; +# +# # Deploy to the the content server if its a "develop" or "release/version" branch +# # The "develop_doc" branch is reserved to test full deploy process without impacting the real content. +# if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then +# PPO_SCRIPT_BRANCH=develop +# elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then +# PPO_SCRIPT_BRANCH=master +# else +# # Early exit, this branch doesn't require documentation build +# return 0; +# fi +# # Fetch the paddlepaddle.org deploy_docs.sh from the appopriate branch +# export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh +# export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python +# cd .. 
+# curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH} +# cd - +} + +function gen_doc_lib() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + cat < + +#include +#include +#include +#include +#include +#include + +struct grpc_completion_queue; + +namespace grpc { + +template +class ClientReader; +template +class ClientWriter; +template +class ClientReaderWriter; +template +class ServerReader; +template +class ServerWriter; +namespace internal { +template +class ServerReaderWriterBody; +} // namespace internal + +class Channel; +class ChannelInterface; +class ClientContext; +class CompletionQueue; +class Server; +class ServerBuilder; +class ServerContext; +class ServerInterface; + +namespace internal { +class CompletionQueueTag; +class RpcMethod; +template +class RpcMethodHandler; +template +class ClientStreamingHandler; +template +class ServerStreamingHandler; +template +class BidiStreamingHandler; +class UnknownMethodHandler; +template +class TemplatedBidiStreamingHandler; +template +class BlockingUnaryCallImpl; +} // namespace internal + +extern CoreCodegenInterface* g_core_codegen_interface; + +/// A thin wrapper around \ref grpc_completion_queue (see \ref +/// src/core/lib/surface/completion_queue.h). +/// See \ref doc/cpp/perf_notes.md for notes on best practices for high +/// performance servers. +class CompletionQueue : private GrpcLibraryCodegen { + public: + /// Default constructor. Implicitly creates a \a grpc_completion_queue + /// instance. + CompletionQueue() + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {} + + /// Wrap \a take, taking ownership of the instance. + /// + /// \param take The completion queue instance to wrap. Ownership is taken. + explicit CompletionQueue(grpc_completion_queue* take); + + /// Destructor. Destroys the owned wrapped completion queue / instance. + ~CompletionQueue() { + if (typeid(*g_core_codegen_interface).hash_code() != + typeid(CoreCodegenInterface).hash_code()) { + g_core_codegen_interface->grpc_completion_queue_destroy(cq_); + } + } + + /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT. + enum NextStatus { + SHUTDOWN, ///< The completion queue has been shutdown and fully-drained + GOT_EVENT, ///< Got a new event; \a tag will be filled in with its + ///< associated value; \a ok indicating its success. + TIMEOUT ///< deadline was reached. + }; + + /// Read from the queue, blocking until an event is available or the queue is + /// shutting down. + /// + /// \param tag[out] Updated to point to the read event's tag. + /// \param ok[out] true if read a successful event, false otherwise. + /// + /// Note that each tag sent to the completion queue (through RPC operations + /// or alarms) will be delivered out of the completion queue by a call to + /// Next (or a related method), regardless of whether the operation succeeded + /// or not. Success here means that this operation completed in the normal + /// valid manner. + /// + /// Server-side RPC request: \a ok indicates that the RPC has indeed + /// been started. If it is false, the server has been Shutdown + /// before this particular call got matched to an incoming RPC. + /// + /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is + /// going to go to the wire. If it is false, it not going to the wire. 
This + /// would happen if the channel is either permanently broken or + /// transiently broken but with the fail-fast option. (Note that async unary + /// RPCs don't post a CQ tag at this point, nor do client-streaming + /// or bidi-streaming RPCs that have the initial metadata corked option set.) + /// + /// Client-side Write, Client-side WritesDone, Server-side Write, + /// Server-side Finish, Server-side SendInitialMetadata (which is + /// typically included in Write or Finish when not done explicitly): + /// \a ok means that the data/metadata/status/etc is going to go to the + /// wire. If it is false, it not going to the wire because the call + /// is already dead (i.e., canceled, deadline expired, other side + /// dropped the channel, etc). + /// + /// Client-side Read, Server-side Read, Client-side + /// RecvInitialMetadata (which is typically included in Read if not + /// done explicitly): \a ok indicates whether there is a valid message + /// that got read. If not, you know that there are certainly no more + /// messages that can ever be read from this stream. For the client-side + /// operations, this only happens because the call is dead. For the + /// server-sider operation, though, this could happen because the client + /// has done a WritesDone already. + /// + /// Client-side Finish: \a ok should always be true + /// + /// Server-side AsyncNotifyWhenDone: \a ok should always be true + /// + /// Alarm: \a ok is true if it expired, false if it was canceled + /// + /// \return true if got an event, false if the queue is fully drained and + /// shut down. + bool Next(void** tag, bool* ok) { + return (AsyncNextInternal(tag, + ok, + g_core_codegen_interface->gpr_inf_future( + GPR_CLOCK_REALTIME)) != SHUTDOWN); + } + + /// Read from the queue, blocking up to \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param tag[out] Upon sucess, updated to point to the event's tag. + /// \param ok[out] Upon sucess, true if a successful event, false otherwise + /// See documentation for CompletionQueue::Next for explanation of ok + /// \param deadline[in] How long to block in wait for an event. + /// + /// \return The type of event read. + template + NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) { + TimePoint deadline_tp(deadline); + return AsyncNextInternal(tag, ok, deadline_tp.raw_time()); + } + + /// EXPERIMENTAL + /// First executes \a F, then reads from the queue, blocking up to + /// \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param F[in] Function to execute before calling AsyncNext on this queue. + /// \param tag[out] Upon sucess, updated to point to the event's tag. + /// \param ok[out] Upon sucess, true if read a regular event, false otherwise. + /// \param deadline[in] How long to block in wait for an event. + /// + /// \return The type of event read. 
+ template + NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) { + CompletionQueueTLSCache cache = CompletionQueueTLSCache(this); + f(); + if (cache.Flush(tag, ok)) { + return GOT_EVENT; + } else { + return AsyncNext(tag, ok, deadline); + } + } + + /// Request the shutdown of the queue. + /// + /// \warning This method must be called at some point if this completion queue + /// is accessed with Next or AsyncNext. \a Next will not return false + /// until this method has been called and all pending tags have been drained. + /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .) + /// Only once either one of these methods does that (that is, once the queue + /// has been \em drained) can an instance of this class be destroyed. + /// Also note that applications must ensure that no work is enqueued on this + /// completion queue after this method is called. + void Shutdown(); + + /// Returns a \em raw pointer to the underlying \a grpc_completion_queue + /// instance. + /// + /// \warning Remember that the returned instance is owned. No transfer of + /// owership is performed. + grpc_completion_queue* cq() { return cq_; } + + protected: + /// Private constructor of CompletionQueue only visible to friend classes + CompletionQueue(const grpc_completion_queue_attributes& attributes) { + cq_ = g_core_codegen_interface->grpc_completion_queue_create( + g_core_codegen_interface->grpc_completion_queue_factory_lookup( + &attributes), + &attributes, + NULL); + InitialAvalanching(); // reserve this for the future shutdown + } + + private: + // Friend synchronous wrappers so that they can access Pluck(), which is + // a semi-private API geared towards the synchronous implementation. + template + friend class ::grpc::ClientReader; + template + friend class ::grpc::ClientWriter; + template + friend class ::grpc::ClientReaderWriter; + template + friend class ::grpc::ServerReader; + template + friend class ::grpc::ServerWriter; + template + friend class ::grpc::internal::ServerReaderWriterBody; + template + friend class ::grpc::internal::RpcMethodHandler; + template + friend class ::grpc::internal::ClientStreamingHandler; + template + friend class ::grpc::internal::ServerStreamingHandler; + template + friend class ::grpc::internal::TemplatedBidiStreamingHandler; + friend class ::grpc::internal::UnknownMethodHandler; + friend class ::grpc::Server; + friend class ::grpc::ServerContext; + friend class ::grpc::ServerInterface; + template + friend class ::grpc::internal::BlockingUnaryCallImpl; + + /// EXPERIMENTAL + /// Creates a Thread Local cache to store the first event + /// On this completion queue queued from this thread. Once + /// initialized, it must be flushed on the same thread. + class CompletionQueueTLSCache { + public: + CompletionQueueTLSCache(CompletionQueue* cq); + ~CompletionQueueTLSCache(); + bool Flush(void** tag, bool* ok); + + private: + CompletionQueue* cq_; + bool flushed_; + }; + + NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline); + + /// Wraps \a grpc_completion_queue_pluck. + /// \warning Must not be mixed with calls to \a Next. 
+ bool Pluck(internal::CompletionQueueTag* tag) { + auto deadline = + g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok)); + GPR_CODEGEN_ASSERT(ignored == tag); + // Ignore mutations by FinalizeResult: Pluck returns the C API status + return ev.success != 0; + } + + /// Performs a single polling pluck on \a tag. + /// \warning Must not be mixed with calls to \a Next. + /// + /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already + /// shutdown. This is most likely a bug and if it is a bug, then change this + /// implementation to simple call the other TryPluck function with a zero + /// timeout. i.e: + /// TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME)) + void TryPluck(internal::CompletionQueueTag* tag) { + auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT) return; + bool ok = ev.success != 0; + void* ignored = tag; + // the tag must be swallowed if using TryPluck + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if + /// the pluck() was successful and returned the tag. + /// + /// This exects tag->FinalizeResult (if called) to return 'false' i.e expects + /// that the tag is internal not something that is returned to the user. + void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) { + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) { + return; + } + + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Manage state of avalanching operations : completion queue tags that + /// trigger other completion queue operations. The underlying core completion + /// queue should not really shutdown until all avalanching operations have + /// been finalized. Note that we maintain the requirement that an avalanche + /// registration must take place before CQ shutdown (which must be maintained + /// elsehwere) + void InitialAvalanching() { + gpr_atm_rel_store(&avalanches_in_flight_, static_cast(1)); + } + void RegisterAvalanching() { + gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_, + static_cast(1)); + } + void CompleteAvalanching(); + + grpc_completion_queue* cq_; // owned + + gpr_atm avalanches_in_flight_; +}; + +/// A specific type of completion queue used by the processing of notifications +/// by servers. Instantiated by \a ServerBuilder. +class ServerCompletionQueue : public CompletionQueue { + public: + bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; } + + private: + grpc_cq_polling_type polling_type_; + friend class ServerBuilder; + /// \param is_frequently_polled Informs the GRPC library about whether the + /// server completion queue would be actively polled (by calling Next() or + /// AsyncNext()). By default all server completion queues are assumed to be + /// frequently polled. 
+ ServerCompletionQueue(grpc_cq_polling_type polling_type) + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}), + polling_type_(polling_type) {} +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h new file mode 100644 index 0000000000000000000000000000000000000000..4870a1cda4b2a6489bc379fe53cf3e9659fffc47 --- /dev/null +++ b/patches/grpc/grpc_library.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H +#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H + +#include + +#include + +namespace grpc { + +class GrpcLibraryInterface { + public: + virtual ~GrpcLibraryInterface() = default; + virtual void init() = 0; + virtual void shutdown() = 0; +}; + +/// Initialized by \a grpc::GrpcLibraryInitializer from +/// +extern GrpcLibraryInterface* g_glip; + +/// Classes that require gRPC to be initialized should inherit from this class. +class GrpcLibraryCodegen { + public: + GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) { + if (call_grpc_init) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->init(); + grpc_init_called_ = true; + } + } + virtual ~GrpcLibraryCodegen() { + if (grpc_init_called_ && + typeid(*g_glip).hash_code() != + typeid(GrpcLibraryInterface).hash_code()) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->shutdown(); + } + } + + private: + bool grpc_init_called_; +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 797c0fbcc4a2d61f5cbbf691db19b4cba5d38630..9cdcb87df5dd1669066c204c86c269973df506f1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -91,3 +91,17 @@ endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels ) + +if(APPLE) + find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool) + if(NOT INSTALL_NAME_TOOL_EXECUTABLE) + message(FATAL_ERROR "install_name_tool not found, please check.\n") + endif() +endif() +if(LINUX) + find_program(PATCHELF_EXECUTABLE patchelf) + if(NOT PATCHELF_EXECUTABLE) + message(FATAL_ERROR "patchelf not found, please install it.\n" + "For Ubuntu, the command is: apt-get install -y patchelf.") + endif() +endif(LINUX) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index d1cf04161ae4444ebc7da7fbc20e37dafe6c0fb1..53746afdb25b34b69f89fe0927c877ace62d7d55 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
try: - from version import full_version as __version__ - from version import commit as __git_commit__ + from paddle.version import full_version as __version__ + from paddle.version import commit as __git_commit__ except ImportError: import sys - sys.stderr.write('''Warning with import paddle: you should not + sys.stderr.write('''Warning with import paddle: you should not import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) -import reader -import dataset -import batch +import paddle.reader +import paddle.dataset +import paddle.batch +import paddle.compat batch = batch.batch diff --git a/python/paddle/batch.py b/python/paddle/batch.py index 3c6a53db3c2287e8ef5931a06ca5dad455665ee0..008509660739d61245526278735064472b8b06dd 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=True): +def batch(reader, batch_size, drop_last=False): """ Create a batched reader. @@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=True): if drop_last == False and len(b) != 0: yield b + # Batch size check + batch_size = int(batch_size) + if batch_size <= 0: + raise ValueError("batch_size should be a positive integeral value, " + "but got batch_size={}".format(batch_size)) + return batch_reader diff --git a/python/paddle/compat.py b/python/paddle/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..50726b6fa1bbbde68a590c86db9344b8f02f79f2 --- /dev/null +++ b/python/paddle/compat.py @@ -0,0 +1,237 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import math + +__all__ = [ + 'long_type', + 'to_text', + 'to_bytes', + 'round', + 'floor_division', + 'get_exception_message', +] + +if six.PY2: + int_type = int + long_type = long +else: + int_type = int + long_type = int + + +# str and bytes related functions +def to_text(obj, encoding='utf-8', inplace=False): + """ + All string in PaddlePaddle should be represented as a literal string. + This function will convert object to a literal string without any encoding. + Especially, if the object type is a list or set container, we will iterate + all items in the object and convert them to literal string. + + In Python3: + Decode the bytes type object to str type with specific encoding + + In Python2: + Decode the str type object to unicode type with specific encoding + + Args: + obj(unicode|str|bytes|list|set) : The object to be decoded. 
+ encoding(str) : The encoding format to decode a string + inplace(bool) : If we change the original object or we create a new one + + Returns: + Decoded result of obj + """ + if obj is None: + return obj + + if isinstance(obj, list): + if inplace: + for i in six.moves.xrange(len(obj)): + obj[i] = _to_text(obj[i], encoding) + return obj + else: + return [_to_text(item, encoding) for item in obj] + elif isinstance(obj, set): + if inplace: + for item in obj: + obj.remove(item) + obj.add(_to_text(item, encoding)) + return obj + else: + return set([_to_text(item, encoding) for item in obj]) + else: + return _to_text(obj, encoding) + + +def _to_text(obj, encoding): + """ + In Python3: + Decode the bytes type object to str type with specific encoding + + In Python2: + Decode the str type object to unicode type with specific encoding, + or we just return the unicode string of object + + Args: + obj(unicode|str|bytes) : The object to be decoded. + encoding(str) : The encoding format + + Returns: + decoded result of obj + """ + if obj is None: + return obj + + if isinstance(obj, six.binary_type): + return obj.decode(encoding) + elif isinstance(obj, six.text_type): + return obj + else: + return six.u(obj) + + +def to_bytes(obj, encoding='utf-8', inplace=False): + """ + All string in PaddlePaddle should be represented as a literal string. + This function will convert object to a bytes with specific encoding. + Especially, if the object type is a list or set container, we will iterate + all items in the object and convert them to bytes. + + In Python3: + Encode the str type object to bytes type with specific encoding + + In Python2: + Encode the unicode type object to str type with specific encoding, + or we just return the 8-bit string of object + + Args: + obj(unicode|str|bytes|list|set) : The object to be encoded. + encoding(str) : The encoding format to encode a string + inplace(bool) : If we change the original object or we create a new one + + Returns: + Decoded result of obj + """ + if obj is None: + return obj + + if isinstance(obj, list): + if inplace: + for i in six.moves.xrange(len(obj)): + obj[i] = _to_bytes(obj[i], encoding) + return obj + else: + return [_to_bytes(item, encoding) for item in obj] + elif isinstance(obj, set): + if inplace: + for item in obj: + obj.remove(item) + obj.add(_to_bytes(item, encoding)) + return obj + else: + return set([_to_bytes(item, encoding) for item in obj]) + else: + return _to_bytes(obj, encoding) + + +def _to_bytes(obj, encoding): + """ + In Python3: + Encode the str type object to bytes type with specific encoding + + In Python2: + Encode the unicode type object to str type with specific encoding, + or we just return the 8-bit string of object + + Args: + obj(unicode|str|bytes) : The object to be encoded. + encoding(str) : The encoding format + + Returns: + encoded result of obj + """ + if obj is None: + return obj + + assert encoding is not None + if isinstance(obj, six.text_type): + return obj.encode(encoding) + elif isinstance(obj, six.binary_type): + return obj + else: + return six.b(obj) + + +# math related functions +def round(x, d=0): + """ + Compatible round which act the same behaviour in Python3. + + Args: + x(float) : The number to round halfway. 
+ + Returns: + round result of x + """ + if six.PY3: + # The official walkaround of round in Python3 is incorrect + # we implement accroding this answer: https://www.techforgeek.info/round_python.html + if x > 0.0: + p = 10**d + return float(math.floor((x * p) + math.copysign(0.5, x))) / p + elif x < 0.0: + p = 10**d + return float(math.ceil((x * p) + math.copysign(0.5, x))) / p + else: + return math.copysign(0.0, x) + else: + import __builtin__ + return __builtin__.round(x, d) + + +def floor_division(x, y): + """ + Compatible division which act the same behaviour in Python3 and Python2, + whose result will be a int value of floor(x / y) in Python3 and value of + (x / y) in Python2. + + Args: + x(int|float) : The number to divide. + y(int|float) : The number to be divided + + Returns: + division result of x // y + """ + return x // y + + +# exception related functions +def get_exception_message(exc): + """ + Get the error message of a specific exception + + Args: + exec(Exception) : The exception to get error message. + + Returns: + the error message of exec + """ + assert exc is not None + + if six.PY2: + return exc.message + else: + return str(exc) diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 3315e826e82a33dfeb9c5223ce196cffb1ae7234..54aa3edc51d3734633ce077a59bd86cec8d09032 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -15,20 +15,20 @@ Dataset package. """ -import mnist -import imikolov -import imdb -import cifar -import movielens -import conll05 -import uci_housing -import sentiment -import wmt14 -import wmt16 -import mq2007 -import flowers -import voc2012 -import image +import paddle.dataset.mnist +import paddle.dataset.imikolov +import paddle.dataset.imdb +import paddle.dataset.cifar +import paddle.dataset.movielens +import paddle.dataset.conll05 +import paddle.dataset.uci_housing +import paddle.dataset.sentiment +import paddle.dataset.wmt14 +import paddle.dataset.wmt16 +import paddle.dataset.mq2007 +import paddle.dataset.flowers +import paddle.dataset.voc2012 +import paddle.dataset.image __all__ = [ 'mnist', diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py index 07f4dcbdab2fecf84a0a7042a48a8c8a9e5f880d..b83fa78c4c65357407b7f884f8c3fe8ef0ccaba8 100644 --- a/python/paddle/dataset/cifar.py +++ b/python/paddle/dataset/cifar.py @@ -28,11 +28,14 @@ images per class. 
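Note: the helpers defined in python/paddle/compat.py above are intended to behave identically on Python 2 and Python 3. A short usage sketch, restricted to the functions introduced in this patch:

import paddle.compat as cpt

print(cpt.to_text(b'layer_0.w'))     # bytes -> literal (unicode) string
print(cpt.to_bytes(u'layer_0.w'))    # literal string -> bytes
print(cpt.round(0.5))                # 1.0: rounds half away from zero, unlike Python 3's builtin
print(cpt.floor_division(7, 2))      # 3 on both interpreters
try:
    raise ValueError('bad shape')
except ValueError as e:
    print(cpt.get_exception_message(e))   # 'bad shape' on both interpreters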
""" -import cPickle +from __future__ import print_function + import itertools import numpy import paddle.dataset.common import tarfile +import six +from six.moves import cPickle as pickle __all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] @@ -43,12 +46,13 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' -def reader_creator(filename, sub_name): +def reader_creator(filename, sub_name, cycle=False): def read_batch(batch): - data = batch['data'] - labels = batch.get('labels', batch.get('fine_labels', None)) + data = batch[six.b('data')] + labels = batch.get( + six.b('labels'), batch.get(six.b('fine_labels'), None)) assert labels is not None - for sample, label in itertools.izip(data, labels): + for sample, label in six.moves.zip(data, labels): yield (sample / 255.0).astype(numpy.float32), int(label) def reader(): @@ -56,10 +60,17 @@ def reader_creator(filename, sub_name): names = (each_item.name for each_item in f if sub_name in each_item.name) - for name in names: - batch = cPickle.load(f.extractfile(name)) - for item in read_batch(batch): - yield item + while True: + for name in names: + if six.PY2: + batch = pickle.load(f.extractfile(name)) + else: + batch = pickle.load( + f.extractfile(name), encoding='bytes') + for item in read_batch(batch): + yield item + if not cycle: + break return reader @@ -94,34 +105,40 @@ def test100(): 'test') -def train10(): +def train10(cycle=False): """ CIFAR-10 training set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Training reader creator :rtype: callable """ return reader_creator( paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + 'data_batch', + cycle=cycle) -def test10(): +def test10(cycle=False): """ CIFAR-10 test set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Test reader creator. :rtype: callable """ return reader_creator( paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + 'test_batch', + cycle=cycle) def fetch(): diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 68660601c161d2332b17b448fae089506238ba78..ece4046f5b7a7eff5be724d6f890665be7f3344e 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -12,17 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import requests import hashlib import os import errno import shutil +import six import sys import importlib import paddle.dataset -import cPickle +import six.moves.cPickle as pickle import glob -import cPickle as pickle __all__ = [ 'DATA_HOME', @@ -75,24 +77,26 @@ def download(url, module_name, md5sum, save_name=None): retry_limit = 3 while not (os.path.exists(filename) and md5file(filename) == md5sum): if os.path.exists(filename): - print "file md5", md5file(filename), md5sum + print("file md5", md5file(filename), md5sum) if retry < retry_limit: retry += 1 else: raise RuntimeError("Cannot download {0} within retry limit {1}". 
format(url, retry_limit)) - print "Cache file %s not found, downloading %s" % (filename, url) + print("Cache file %s not found, downloading %s" % (filename, url)) r = requests.get(url, stream=True) total_length = r.headers.get('content-length') if total_length is None: - with open(filename, 'w') as f: + with open(filename, 'wb') as f: shutil.copyfileobj(r.raw, f) else: - with open(filename, 'w') as f: + with open(filename, 'wb') as f: dl = 0 total_length = int(total_length) for data in r.iter_content(chunk_size=4096): + if six.PY2: + data = six.b(data) dl += len(data) f.write(data) done = int(50 * dl / total_length) @@ -104,8 +108,9 @@ def download(url, module_name, md5sum, save_name=None): def fetch_all(): - for module_name in filter(lambda x: not x.startswith("__"), - dir(paddle.dataset)): + for module_name in [ + x for x in dir(paddle.dataset) if not x.startswith("__") + ]: if "fetch" in dir( importlib.import_module("paddle.dataset.%s" % module_name)): getattr( @@ -114,8 +119,9 @@ def fetch_all(): def fetch_all_recordio(path): - for module_name in filter(lambda x: not x.startswith("__"), - dir(paddle.dataset)): + for module_name in [ + x for x in dir(paddle.dataset) if not x.startswith("__") + ]: if "convert" in dir( importlib.import_module("paddle.dataset.%s" % module_name)) and \ not module_name == "common": @@ -126,7 +132,7 @@ def fetch_all_recordio(path): "convert")(ds_path) -def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): +def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump): """ you can call the function as: @@ -167,7 +173,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): def cluster_files_reader(files_pattern, trainer_count, trainer_id, - loader=cPickle.load): + loader=pickle.load): """ Create a reader that yield element from the given files, select a file set according trainer count and trainer_id @@ -188,7 +194,7 @@ def cluster_files_reader(files_pattern, my_file_list = [] for idx, fn in enumerate(file_list): if idx % trainer_count == trainer_id: - print "append file: %s" % fn + print("append file: %s" % fn) my_file_list.append(fn) for fn in my_file_list: with open(fn, "r") as f: @@ -221,7 +227,7 @@ def convert(output_path, reader, line_count, name_prefix): for l in lines: # FIXME(Yancey1989): # dumps with protocol: pickle.HIGHEST_PROTOCOL - writer.write(cPickle.dumps(l)) + writer.write(pickle.dumps(l)) writer.close() lines = [] diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 4e94ce89892f8e6822c15fdc510805e75dfca988..55cfd92721e95d66f1cf38e2f77d9bb6b9e17d7a 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -20,22 +20,26 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used to initialize SRL model. 
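
The download() changes above (binary file mode plus byte-safe chunks) are what keep the streaming progress bar working identically on Python 2 and 3. A rough, self-contained sketch of that pattern, using the requests package as the dataset code does; the function name and URL below are illustrative only:

.. code-block:: python

    import sys

    import requests


    def download_with_progress(url, filename, chunk_size=4096):
        # Stream the response and write raw bytes; opening with 'wb' matters
        # on Python 3 because r.iter_content() yields bytes, not str.
        r = requests.get(url, stream=True)
        total = r.headers.get('content-length')
        with open(filename, 'wb') as f:
            if total is None:
                f.write(r.content)
                return
            total = int(total)
            done = 0
            for chunk in r.iter_content(chunk_size=chunk_size):
                done += len(chunk)
                f.write(chunk)
                bar = int(50 * done / total)
                sys.stdout.write("\r[%s%s]" % ('=' * bar, ' ' * (50 - bar)))
                sys.stdout.flush()
        sys.stdout.write("\n")


    # Example call (hypothetical URL):
    # download_with_progress('http://example.com/data.tgz', '/tmp/data.tgz')
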
""" +from __future__ import print_function + import tarfile import gzip import itertools import paddle.dataset.common +import paddle.compat as cpt +from six.moves import zip, range __all__ = ['test, get_dict', 'get_embedding', 'convert'] -DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' -WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' -VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' -TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' -EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 @@ -87,12 +91,12 @@ def corpus_reader(data_path, words_name, props_name): sentences = [] labels = [] one_seg = [] - for word, label in itertools.izip(words_file, props_file): - word = word.strip() - label = label.strip().split() + for word, label in zip(words_file, props_file): + word = cpt.to_text(word.strip()) + label = cpt.to_text(label.strip().split()) if len(label) == 0: # end of sentence - for i in xrange(len(one_seg[0])): + for i in range(len(one_seg[0])): a_kind_lable = [x[i] for x in one_seg] labels.append(a_kind_lable) diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index 527044b415533cc640e3cfc5837c08ab0f8b74b1..0d4e7f1ee46ff97912d010cdb268cc4898d99f58 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -28,23 +28,29 @@ Graphics and Image Processing (2008) http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. """ -import cPickle + +from __future__ import print_function + import itertools import functools -from common import download +from .common import download import tarfile +import six import scipy.io as scio from paddle.dataset.image import * from paddle.reader import * import os import numpy as np from multiprocessing import cpu_count +import six +from six.moves import cPickle as pickle +from six.moves import zip __all__ = ['train', 'test', 'valid'] -DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' -LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' -SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' -DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' +DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz' +LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat' +SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat' +DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' # In official 'readme', tstid is the flag of test data @@ -76,7 +82,8 @@ def reader_creator(data_file, dataset_name, mapper, buffered_size=1024, - use_xmap=True): + use_xmap=True, + cycle=False): ''' 1. 
read images from tar file and merge images into batch files in 102flowers.tgz_batch/ @@ -96,6 +103,8 @@ def reader_creator(data_file, :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: data reader :rtype: callable ''' @@ -108,15 +117,21 @@ def reader_creator(data_file, file_list = batch_images_from_tar(data_file, dataset_name, img2label) def reader(): - for file in open(file_list): - file = file.strip() - batch = None - with open(file, 'r') as f: - batch = cPickle.load(f) - data = batch['data'] - labels = batch['label'] - for sample, label in itertools.izip(data, batch['label']): - yield sample, int(label) - 1 + while True: + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'rb') as f: + if six.PY2: + batch = pickle.load(f) + else: + batch = pickle.load(f, encoding='bytes') + data = batch['data'] + labels = batch['label'] + for sample, label in zip(data, batch['label']): + yield sample, int(label) - 1 + if not cycle: + break if use_xmap: cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) @@ -125,7 +140,7 @@ def reader_creator(data_file, return map_readers(mapper, reader) -def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers training set reader. It returns a reader, each sample in the reader is @@ -138,17 +153,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: train data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TRAIN_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) -def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers test set reader. It returns a reader, each sample in the reader is @@ -161,14 +182,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: test data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TEST_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 9235c41e9eb95b25a0dc53a494a203e7a4525981..19fc229e6fa84792f58aeeb00be09eb2401b19c7 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -29,6 +29,9 @@ the image layout as follows. formats can be used for training. Noted that, the format should be keep consistent between the training and inference peroid. 
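
The flowers reader above, like the CIFAR one earlier, gains a cycle flag and loads Python 2 pickles with encoding='bytes' under Python 3. A small sketch of the same cycling-reader pattern on toy data; the names below are illustrative, not the Paddle reader API:

.. code-block:: python

    import six
    from six.moves import cPickle as pickle


    def make_reader(samples, cycle=False):
        # A reader creator: returns a no-arg callable that yields samples,
        # optionally looping over the dataset forever when cycle=True.
        def reader():
            while True:
                for s in samples:
                    yield s
                if not cycle:
                    break

        return reader


    def load_pickle(path):
        # Python 3 needs encoding='bytes' to read pickles written by Python 2.
        with open(path, 'rb') as f:
            if six.PY2:
                return pickle.load(f)
            return pickle.load(f, encoding='bytes')


    r = make_reader([1, 2, 3], cycle=False)
    print(list(r()))  # [1, 2, 3]
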
""" + +from __future__ import print_function + import numpy as np try: import cv2 @@ -36,7 +39,7 @@ except ImportError: cv2 = None import os import tarfile -import cPickle +import six.moves.cPickle as pickle __all__ = [ "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", @@ -45,6 +48,18 @@ __all__ = [ ] +def _check_cv2(): + if cv2 is None: + import sys + sys.stderr.write( + '''Warning with paddle image module: opencv-python should be imported, + or paddle image module could NOT work; please install opencv-python first.''' + ) + return False + else: + return True + + def batch_images_from_tar(data_file, dataset_name, img2label, @@ -56,7 +71,7 @@ def batch_images_from_tar(data_file, :type data_file: string :param dataset_name: 'train','test' or 'valid' :type dataset_name: string - :param img2label: a dic with image file name as key + :param img2label: a dic with image file name as key and image's label as value :type img2label: dic :param num_per_batch: image number per batch file @@ -86,10 +101,10 @@ def batch_images_from_tar(data_file, output = {} output['label'] = labels output['data'] = data - cPickle.dump( + pickle.dump( output, - open('%s/batch_%d' % (out_path, file_id), 'w'), - protocol=cPickle.HIGHEST_PROTOCOL) + open('%s/batch_%d' % (out_path, file_id), 'wb'), + protocol=2) file_id += 1 data = [] labels = [] @@ -97,10 +112,8 @@ def batch_images_from_tar(data_file, output = {} output['label'] = labels output['data'] = data - cPickle.dump( - output, - open('%s/batch_%d' % (out_path, file_id), 'w'), - protocol=cPickle.HIGHEST_PROTOCOL) + pickle.dump( + output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2) with open(meta_file, 'a') as meta: for file in os.listdir(out_path): @@ -113,7 +126,7 @@ def load_image_bytes(bytes, is_color=True): Load an color or gray image from bytes array. Example usage: - + .. code-block:: python with open('cat.jpg') as f: @@ -126,6 +139,8 @@ def load_image_bytes(bytes, is_color=True): load and return a gray image. :type is_color: bool """ + assert _check_cv2() is True + flag = 1 if is_color else 0 file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) img = cv2.imdecode(file_bytes, flag) @@ -137,7 +152,7 @@ def load_image(file, is_color=True): Load an color or gray image from the file path. Example usage: - + .. code-block:: python im = load_image('cat.jpg') @@ -149,6 +164,8 @@ def load_image(file, is_color=True): load and return a gray image. :type is_color: bool """ + assert _check_cv2() is True + # cv2.IMAGE_COLOR for OpenCV3 # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version # cv2.IMAGE_GRAYSCALE for OpenCV3 @@ -161,28 +178,30 @@ def load_image(file, is_color=True): def resize_short(im, size): - """ + """ Resize an image so that the length of shorter edge is size. Example usage: - + .. code-block:: python im = load_image('cat.jpg') im = resize_short(im, 256) - + :param im: the input image with HWC layout. :type im: ndarray :param size: the shorter edge size of image after resizing. :type size: int """ + assert _check_cv2() is True + h, w = im.shape[:2] h_new, w_new = size, size if h > w: - h_new = size * h / w + h_new = size * h // w else: - w_new = size * w / h - im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + w_new = size * w // h + im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC) return im @@ -193,17 +212,17 @@ def to_chw(im, order=(2, 0, 1)): according the order (2,0,1). Example usage: - + .. 
code-block:: python im = load_image('cat.jpg') im = resize_short(im, 256) im = to_chw(im) - + :param im: the input image with HWC layout. :type im: ndarray :param order: the transposed order. - :type order: tuple|list + :type order: tuple|list """ assert len(im.shape) == len(order) im = im.transpose(order) @@ -215,11 +234,11 @@ def center_crop(im, size, is_color=True): Crop the center of image with size. Example usage: - + .. code-block:: python im = center_crop(im, 224) - + :param im: the input image with HWC layout. :type im: ndarray :param size: the cropping size. @@ -228,8 +247,8 @@ def center_crop(im, size, is_color=True): :type is_color: bool """ h, w = im.shape[:2] - h_start = (h - size) / 2 - w_start = (w - size) / 2 + h_start = (h - size) // 2 + w_start = (w - size) // 2 h_end, w_end = h_start + size, w_start + size if is_color: im = im[h_start:h_end, w_start:w_end, :] @@ -243,11 +262,11 @@ def random_crop(im, size, is_color=True): Randomly crop input image with size. Example usage: - + .. code-block:: python im = random_crop(im, 224) - + :param im: the input image with HWC layout. :type im: ndarray :param size: the cropping size. @@ -272,11 +291,11 @@ def left_right_flip(im, is_color=True): Return the flipped image. Example usage: - + .. code-block:: python im = left_right_flip(im) - + :param im: input image with HWC layout or HW layout for gray image :type im: ndarray :param is_color: whether input image is color or not @@ -299,7 +318,7 @@ def simple_transform(im, resizing, croping and flipping. Example usage: - + .. code-block:: python im = simple_transform(im, 256, 224, True) @@ -314,7 +333,7 @@ def simple_transform(im, :type is_train: bool :param is_color: whether the image is color or not. :type is_color: bool - :param mean: the mean values, which can be element-wise mean values or + :param mean: the mean values, which can be element-wise mean values or mean values per channel. :type mean: numpy array | list """ @@ -324,7 +343,6 @@ def simple_transform(im, if np.random.randint(2) == 0: im = left_right_flip(im, is_color) else: - im = center_crop(im, crop_size, is_color) im = center_crop(im, crop_size, is_color=is_color) if len(im.shape) == 3: im = to_chw(im) @@ -332,7 +350,7 @@ def simple_transform(im, im = im.astype('float32') if mean is not None: mean = np.array(mean, dtype=np.float32) - # mean value, may be one value per channel + # mean value, may be one value per channel if mean.ndim == 1 and is_color: mean = mean[:, np.newaxis, np.newaxis] elif mean.ndim == 1: @@ -357,7 +375,7 @@ def load_and_transform(filename, for the transform operations. Example usage: - + .. code-block:: python im = load_and_transform('cat.jpg', 256, 224, True) @@ -372,7 +390,7 @@ def load_and_transform(filename, :type is_train: bool :param is_color: whether the image is color or not. :type is_color: bool - :param mean: the mean values, which can be element-wise mean values or + :param mean: the mean values, which can be element-wise mean values or mean values per channel. :type mean: numpy array | list """ diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index 5ff05b1e9b7f4c42909370a21beb140ecdcd6868..fd92523a947689a71b6f9371a3ef4838eb9d194d 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing. Besides, this module also provides API for building dictionary. 
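
The switch from / to // in the image helpers above matters because Python 3's / always produces a float, while cv2.resize() and array slicing need integer sizes (the same hunk also fixes the (width, height) argument order passed to cv2.resize). A tiny NumPy-only illustration of the integer-division fix, with toy dimensions:

.. code-block:: python

    import numpy as np

    h, w, size = 375, 500, 256

    # On Python 2, size * h / w was already an int; on Python 3 it becomes a
    # float (192.0), which cv2.resize() would reject. Floor division works on both.
    h_new = size * h // w   # 192
    w_new = size            # the shorter edge becomes `size`

    # The center-crop offsets need the same treatment.
    h_start = (h - size) // 2
    w_start = (w - size) // 2
    im = np.zeros((h, w, 3), dtype='uint8')
    crop = im[h_start:h_start + size, w_start:w_start + size, :]
    print(h_new, crop.shape)  # 192 (256, 256, 3)
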
""" +from __future__ import print_function + import paddle.dataset.common import collections import tarfile import re import string +import six __all__ = ['build_dict', 'train', 'test', 'convert'] @@ -46,8 +49,9 @@ def tokenize(pattern): while tf != None: if bool(pattern.match(tf.name)): # newline and punctuations removal and ad-hoc tokenization. - yield tarf.extractfile(tf).read().rstrip("\n\r").translate( - None, string.punctuation).lower().split() + yield tarf.extractfile(tf).read().rstrip(six.b( + "\n\r")).translate( + None, six.b(string.punctuation)).lower().split() tf = tarf.next() @@ -62,11 +66,11 @@ def build_dict(pattern, cutoff): word_freq[word] += 1 # Not sure if we should prune less-frequent words here. - word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff] dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) words, _ = list(zip(*dictionary)) - word_idx = dict(zip(words, xrange(len(words)))) + word_idx = dict(list(zip(words, six.moves.range(len(words))))) word_idx[''] = len(words) return word_idx diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py index c6c0a0f54373dd068b2c493f6fbc9c8578593aef..8eecb75231de450282fa4838aca5b293cc2101d1 100644 --- a/python/paddle/dataset/imikolov.py +++ b/python/paddle/dataset/imikolov.py @@ -14,13 +14,17 @@ """ imikolov's simple dataset. -This module will download dataset from +This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set into paddle reader creators. """ + +from __future__ import print_function + import paddle.dataset.common import collections import tarfile +import six __all__ = ['train', 'test', 'build_dict', 'convert'] @@ -64,11 +68,13 @@ def build_dict(min_word_freq=50): # remove for now, since we will set it as last index del word_freq[''] - word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items()) + word_freq = [ + x for x in six.iteritems(word_freq) if x[1] > min_word_freq + ] word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) words, _ = list(zip(*word_freq_sorted)) - word_idx = dict(zip(words, xrange(len(words)))) + word_idx = dict(list(zip(words, six.moves.range(len(words))))) word_idx[''] = len(words) return word_idx @@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type): l = [''] + l.strip().split() + [''] if len(l) >= n: l = [word_idx.get(w, UNK) for w in l] - for i in range(n, len(l) + 1): + for i in six.moves.range(n, len(l) + 1): yield tuple(l[i - n:i]) elif DataType.SEQ == data_type: l = l.strip().split() diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 9d05aeeb95c4f936cb773ece20407ecb32cbbf21..38addd0cfd9bd0afde7eefc57f2111b717b7e636 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -17,10 +17,15 @@ MNIST dataset. This module will download dataset from http://yann.lecun.com/exdb/mnist/ and parse training set and test set into paddle reader creators. """ + +from __future__ import print_function + import paddle.dataset.common import subprocess import numpy import platform +import tempfile +from six.moves import range __all__ = ['train', 'test', 'convert'] URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' @@ -45,31 +50,42 @@ def reader_creator(image_filename, label_filename, buffer_size): # According to http://stackoverflow.com/a/38061619/724872, we # cannot use standard package gzip here. 
- m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE) - m.stdout.read(16) # skip some magic bytes + tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset') + m = subprocess.Popen( + [zcat_cmd, image_filename], stdout=tmp_image_file).communicate() + tmp_image_file.seek(16) # skip some magic bytes - l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE) - l.stdout.read(8) # skip some magic bytes + # Python3 will not take stdout as file + tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset') + l = subprocess.Popen( + [zcat_cmd, label_filename], stdout=tmp_label_file).communicate() + tmp_label_file.seek(8) # skip some magic bytes try: # reader could be break. while True: labels = numpy.fromfile( - l.stdout, 'ubyte', count=buffer_size).astype("int") + tmp_label_file, 'ubyte', count=buffer_size).astype("int") if labels.size != buffer_size: break # numpy.fromfile returns empty slice after EOF. images = numpy.fromfile( - m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape( - (buffer_size, 28 * 28)).astype('float32') + tmp_image_file, 'ubyte', count=buffer_size * 28 * + 28).reshape((buffer_size, 28 * 28)).astype('float32') images = images / 255.0 * 2.0 - 1.0 - for i in xrange(buffer_size): + for i in range(buffer_size): yield images[i, :], int(labels[i]) finally: - m.terminate() - l.terminate() + try: + m.terminate() + except: + pass + try: + l.terminate() + except: + pass return reader diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index ab11716202a8298c182e23b661eb1d2ac74bf4da..64bf7414819ad74365744adbd760b73d4adaff7c 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -16,17 +16,22 @@ Movielens 1-M dataset. Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was collected by GroupLens Research. This module will download -Movielens 1-M dataset from +Movielens 1-M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training set and test set into paddle reader creators. 
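
The MNIST reader rewrite above decompresses into a real temporary file because, under Python 3, numpy.fromfile() cannot read from a subprocess pipe the way it could from Popen.stdout on Python 2. A sketch of that pattern; it assumes a Unix zcat binary, and the file names and record sizes are illustrative:

.. code-block:: python

    import subprocess
    import tempfile

    import numpy


    def read_ubyte_records(gz_path, header_bytes, record_size, count):
        # Decompress into a seekable temporary file instead of reading the pipe.
        tmp = tempfile.TemporaryFile(prefix='paddle_dataset')
        subprocess.Popen(['zcat', gz_path], stdout=tmp).communicate()
        tmp.seek(header_bytes)  # skip the magic/header bytes
        while True:
            batch = numpy.fromfile(tmp, 'ubyte', count=count * record_size)
            if batch.size != count * record_size:
                break  # EOF: fromfile returns a short (or empty) array
            yield batch.reshape((count, record_size))


    # Hypothetical usage with the MNIST image file (16 header bytes,
    # 28*28 = 784 bytes per record):
    # for images in read_ubyte_records('train-images-idx3-ubyte.gz', 16, 784, 100):
    #     print(images.shape)  # (100, 784)
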
""" +from __future__ import print_function + +import numpy as np import zipfile import paddle.dataset.common import re import random import functools +import six +import paddle.compat as cpt __all__ = [ 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', @@ -112,6 +117,7 @@ def __initialize_meta_info__(): categories_set = set() with package.open('ml-1m/movies.dat') as movie_file: for i, line in enumerate(movie_file): + line = cpt.to_text(line, encoding='latin') movie_id, title, categories = line.strip().split('::') categories = categories.split('|') for c in categories: @@ -136,6 +142,7 @@ def __initialize_meta_info__(): USER_INFO = dict() with package.open('ml-1m/users.dat') as user_file: for line in user_file: + line = cpt.to_text(line, encoding='latin') uid, gender, age, job, _ = line.strip().split("::") USER_INFO[int(uid)] = UserInfo( index=uid, gender=gender, age=age, job_id=job) @@ -144,11 +151,12 @@ def __initialize_meta_info__(): def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): fn = __initialize_meta_info__() - rand = random.Random(x=rand_seed) + np.random.seed(rand_seed) with zipfile.ZipFile(file=fn) as package: with package.open('ml-1m/ratings.dat') as rating: for line in rating: - if (rand.random() < test_ratio) == is_test: + line = cpt.to_text(line, encoding='latin') + if (np.random.random() < test_ratio) == is_test: uid, mov_id, rating, _ = line.strip().split("::") uid = int(uid) mov_id = int(mov_id) @@ -187,7 +195,7 @@ def max_movie_id(): Get the maximum value of movie id. """ __initialize_meta_info__() - return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index def max_user_id(): @@ -195,7 +203,7 @@ def max_user_id(): Get the maximum value of user id. """ __initialize_meta_info__() - return reduce(__max_index_info__, USER_INFO.viewvalues()).index + return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index def __max_job_id_impl__(a, b): @@ -210,7 +218,8 @@ def max_job_id(): Get the maximum value of job id. """ __initialize_meta_info__() - return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + return six.moves.reduce(__max_job_id_impl__, + list(USER_INFO.values())).job_id def movie_categories(): @@ -243,7 +252,7 @@ def unittest(): for test_count, _ in enumerate(test()()): pass - print train_count, test_count + print(train_count, test_count) def fetch(): diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py index d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1..d5740f30c898d5704636e1de9b2e1137d12e3c35 100644 --- a/python/paddle/dataset/mq2007.py +++ b/python/paddle/dataset/mq2007.py @@ -23,10 +23,12 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20 """ +from __future__ import print_function + import os import functools import rarfile -from common import download +from .common import download import numpy as np # URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar" @@ -53,7 +55,7 @@ class Query(object): ---------- query_id : int query_id in dataset, mapping from query to relevance documents - relevance_score : int + relevance_score : int relevance score of query and document pair feature_vector : array, dense feature feature in vector format @@ -92,7 +94,7 @@ class Query(object): sys.stdout.write("expect 48 space split parts, get %d" % (len(parts))) return None - # format : 0 qid:10 1:0.000272 2:0.000000 .... 
+ # format : 0 qid:10 1:0.000272 2:0.000000 .... self.relevance_score = int(parts[0]) self.query_id = int(parts[1].split(':')[1]) for p in parts[2:]: @@ -295,7 +297,7 @@ def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): -------- filename : string fill_missing : fill the missing value. default in MQ2007 is -1 - + Returns ------ yield @@ -330,4 +332,4 @@ if __name__ == "__main__": mytest = functools.partial( __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise") for label, query in mytest(): - print label, query + print(label, query) diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py index f5461164fe6b816356e42fc7b7dcf388eccfadfb..22d867beea25c97efcbcb6f61ca2b7a7777f9c5c 100644 --- a/python/paddle/dataset/sentiment.py +++ b/python/paddle/dataset/sentiment.py @@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK TODO(yuyang18): Complete dataset. """ +from __future__ import print_function + +import six import collections from itertools import chain @@ -43,11 +46,11 @@ def download_data_if_not_yet(): nltk.data.path.append(paddle.dataset.common.DATA_HOME) movie_reviews.categories() except LookupError: - print "Downloading movie_reviews data set, please wait....." + print("Downloading movie_reviews data set, please wait.....") nltk.download( 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) - print "Download data set success....." - print "Path is " + nltk.data.find('corpora/movie_reviews').path + print("Download data set success.....") + print("Path is " + nltk.data.find('corpora/movie_reviews').path) def get_word_dict(): @@ -64,7 +67,7 @@ def get_word_dict(): for field in movie_reviews.fileids(category): for words in movie_reviews.words(field): word_freq_dict[words] += 1 - words_sort_list = word_freq_dict.items() + words_sort_list = six.iteritems(word_freq_dict) words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) for index, word in enumerate(words_sort_list): words_freq_sorted.append((word[0], index)) @@ -80,7 +83,8 @@ def sort_files(): files_list = list() neg_file_list = movie_reviews.fileids('neg') pos_file_list = movie_reviews.fileids('pos') - files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + files_list = list( + chain.from_iterable(list(zip(neg_file_list, pos_file_list)))) return files_list diff --git a/python/paddle/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py index 839125b09dd5c6432e3572374a7345a77a43f7cf..8e514f0fd9a18a7d512430111a8a11b942950d20 100644 --- a/python/paddle/dataset/tests/cifar_test.py +++ b/python/paddle/dataset/tests/cifar_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.cifar import unittest diff --git a/python/paddle/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py index e7cc02aa83061599ffefa18de6cb02ac0fc9e9b7..0ce7d83f374f8c09f68527473418de8ce84c36b1 100644 --- a/python/paddle/dataset/tests/common_test.py +++ b/python/paddle/dataset/tests/common_test.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import paddle.dataset.common import unittest import tempfile import glob +from six.moves import range class TestCommon(unittest.TestCase): @@ -36,7 +39,7 @@ class TestCommon(unittest.TestCase): def test_split(self): def test_reader(): def reader(): - for x in xrange(10): + for x in range(10): yield x return reader @@ -49,7 +52,7 @@ class TestCommon(unittest.TestCase): def test_cluster_file_reader(self): _, temp_path = tempfile.mkstemp() - for x in xrange(5): + for x in range(5): with open(temp_path + '/%05d.test' % x) as f: f.write('%d\n' % x) reader = paddle.dataset.common.cluster_files_reader( @@ -63,7 +66,7 @@ class TestCommon(unittest.TestCase): def test_reader(): def reader(): - for x in xrange(record_num): + for x in range(record_num): yield x return reader diff --git a/python/paddle/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py index 06260fd796ce0271b7cec2f42a8a5a255a02dc24..06a0a7761cfa10ca3211297d176e3e909332e271 100644 --- a/python/paddle/dataset/tests/flowers_test.py +++ b/python/paddle/dataset/tests/flowers_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.flowers import unittest diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py index 539da049449cd273db0a9e260851ed40e1be0f04..415947e3477f2e5b9979588528f7cb6f799acf6a 100644 --- a/python/paddle/dataset/tests/imdb_test.py +++ b/python/paddle/dataset/tests/imdb_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.imdb import unittest import re diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py index 233fd9fc8cea4cd0b5cd052580030fc8c993693c..1f78a5dd4d1a09c3192bc8c144c5a78c8a214f3a 100644 --- a/python/paddle/dataset/tests/imikolov_test.py +++ b/python/paddle/dataset/tests/imikolov_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.imikolov import unittest @@ -59,7 +61,7 @@ class TestMikolov(unittest.TestCase): self.assertEqual(first_line, read_line) def test_total(self): - _, idx = zip(*WORD_DICT.items()) + _, idx = list(zip(*list(WORD_DICT.items()))) self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) diff --git a/python/paddle/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py index 8ada19d3f2ee13e194d08e19a4b86b558c69a0a7..fbb5d926494e38283e78ec15381530e50f32915d 100644 --- a/python/paddle/dataset/tests/mnist_test.py +++ b/python/paddle/dataset/tests/mnist_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.mnist import unittest diff --git a/python/paddle/dataset/tests/mq2007_test.py b/python/paddle/dataset/tests/mq2007_test.py index fba388724a8e84591df7150b41f8ea39a850fc31..ee0897e88f0d7ad089b7f7b68d31d04d96fa3e9d 100644 --- a/python/paddle/dataset/tests/mq2007_test.py +++ b/python/paddle/dataset/tests/mq2007_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import paddle.dataset.mq2007 import unittest diff --git a/python/paddle/dataset/tests/test_image.py b/python/paddle/dataset/tests/test_image.py index 8bd56607ae1998935a3b3aaa0e3279515c2a540c..32d2eb17ae673e72bbee2fc3bb5e3b05f1b20074 100644 --- a/python/paddle/dataset/tests/test_image.py +++ b/python/paddle/dataset/tests/test_image.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py index 543f4b7378b583ea3857bf785cf330c43e535c2a..bb9830132e987370022df3192060de3e908a2e85 100644 --- a/python/paddle/dataset/tests/test_sentiment.py +++ b/python/paddle/dataset/tests/test_sentiment.py @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import nltk import paddle.dataset.sentiment as st @@ -24,9 +26,8 @@ from nltk.corpus import movie_reviews class TestSentimentMethods(unittest.TestCase): def test_get_word_dict(self): word_dict = st.get_word_dict()[0:10] - test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3), - (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7), - (u'is', 8), (u'in', 9)] + test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4), + ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)] for idx, each in enumerate(word_dict): self.assertEqual(each, test_word_list[idx]) self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path) diff --git a/python/paddle/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py index 0d285461a8ae8a9cc69fbec0dcf5efc106b594f0..cddeb91cab2c0f90567f28f8258156e2bb654abc 100644 --- a/python/paddle/dataset/tests/voc2012_test.py +++ b/python/paddle/dataset/tests/voc2012_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.voc2012 import unittest diff --git a/python/paddle/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py index 8b949d8bf5212d51016a33da322095bde2038200..be121bb10121967590c9e136e9a1964a133e934b 100644 --- a/python/paddle/dataset/tests/wmt16_test.py +++ b/python/paddle/dataset/tests/wmt16_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.wmt16 import unittest diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index fbfa477d055eb5f484989eacce38cee8d617d729..f87fdcc4f0f3c42a92bcff5ddcd532c3108565b1 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and parse training set and test set into paddle reader creators. 
""" -import os +from __future__ import print_function import numpy as np +import six import tempfile import tarfile import os @@ -49,9 +50,12 @@ def feature_range(maximums, minimums): import matplotlib.pyplot as plt fig, ax = plt.subplots() feature_num = len(maximums) - ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.bar(list(range(feature_num)), + maximums - minimums, + color='r', + align='center') ax.set_title('feature scale') - plt.xticks(range(feature_num), feature_names) + plt.xticks(list(range(feature_num)), feature_names) plt.xlim([-1, feature_num]) fig.set_figheight(6) fig.set_figwidth(10) @@ -67,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8): return data = np.fromfile(filename, sep=' ') - data = data.reshape(data.shape[0] / feature_num, feature_num) + data = data.reshape(data.shape[0] // feature_num, feature_num) maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( axis=0) / data.shape[0] feature_range(maximums[:-1], minimums[:-1]) - for i in xrange(feature_num - 1): + for i in six.moves.range(feature_num - 1): data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) offset = int(data.shape[0] * ratio) UCI_TRAIN_DATA = data[:offset] @@ -134,7 +138,7 @@ def predict_reader(): It returns just one tuple data to do inference. :return: one tuple data - :rtype: tuple + :rtype: tuple """ global UCI_TEST_DATA load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5)) diff --git a/python/paddle/dataset/voc2012.py b/python/paddle/dataset/voc2012.py index 9c945574dbcc15f5cee370206ed7e70ba8ab5014..50688937654ae72b77e1439f21a0d7c847d5e135 100644 --- a/python/paddle/dataset/voc2012.py +++ b/python/paddle/dataset/voc2012.py @@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images with segmentation has been increased from 7,062 to 9,993. """ +from __future__ import print_function + import tarfile import io import numpy as np diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index f0908c737874fa7335cca5b5f0cba83190c9f90f..f8c1a33574e642b21feb6843d115b7f4205ef250 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and parse training set and test set into paddle reader creators. """ + +from __future__ import print_function + +import six import tarfile import gzip import paddle.dataset.common +import paddle.compat as cpt __all__ = [ 'train', @@ -36,11 +41,10 @@ URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # this is a small set of data for test. The original data is too large and # will be add later. 
-URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' - 'wmt_shrinked_data/wmt14.tgz') +URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' START = "" @@ -54,7 +58,7 @@ def __read_to_dict(tar_file, dict_size): out_dict = dict() for line_count, line in enumerate(fd): if line_count < size: - out_dict[line.strip()] = line_count + out_dict[cpt.to_text(line.strip())] = line_count else: break return out_dict @@ -85,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size): ] for name in names: for line in f.extractfile(name): - line_split = line.strip().split('\t') + line_split = line.strip().split(six.b('\t')) if len(line_split) != 2: continue src_seq = line_split[0] # one source sequence @@ -154,8 +158,8 @@ def get_dict(dict_size, reverse=True): tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) src_dict, trg_dict = __read_to_dict(tar_file, dict_size) if reverse: - src_dict = {v: k for k, v in src_dict.items()} - trg_dict = {v: k for k, v in trg_dict.items()} + src_dict = {v: k for k, v in six.iteritems(src_dict)} + trg_dict = {v: k for k, v in six.iteritems(trg_dict)} return src_dict, trg_dict diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 540d43b692e0f65460f558dd74a52715ff4db68d..f30dcd518ea6c0c685d027ede3ad6e0a1cb0c82c 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions. } """ +from __future__ import print_function + import os +import six import tarfile import gzip from collections import defaultdict import paddle.dataset.common +import paddle.compat as cpt __all__ = [ "train", @@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang): word_dict = defaultdict(int) with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile("wmt16/train"): - line_split = line.strip().split("\t") + line_split = line.strip().split(six.b("\t")) if len(line_split) != 2: continue sen = line_split[0] if lang == "en" else line_split[1] for w in sen.split(): @@ -70,7 +74,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) for idx, word in enumerate( sorted( - word_dict.iteritems(), key=lambda x: x[1], reverse=True)): + six.iteritems(word_dict), key=lambda x: x[1], + reverse=True)): if idx + 3 == dict_size: break fout.write("%s\n" % (word[0])) @@ -79,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False): dict_path = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)) if not os.path.exists(dict_path) or ( - len(open(dict_path, "r").readlines()) != dict_size): + len(open(dict_path, "rb").readlines()) != dict_size): __build_dict(tar_file, dict_size, dict_path, lang) word_dict = {} - with open(dict_path, "r") as fdict: + with open(dict_path, "rb") as fdict: for idx, line in enumerate(fdict): if reverse: - word_dict[idx] = line.strip() + word_dict[idx] = cpt.to_text(line.strip()) else: - word_dict[line.strip()] = idx + word_dict[cpt.to_text(line.strip())] = idx return word_dict @@ -118,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): with tarfile.open(tar_file, mode="r") as f: for 
line in f.extractfile(file_name): - line_split = line.strip().split("\t") + line_split = line.strip().split(six.b("\t")) if len(line_split) != 2: continue src_words = line_split[src_col].split() diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 74b268aedece0983177616581f0755ea16916697..e4d7575ca4014c29aa29198c1017188c413d0e00 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -14,77 +14,79 @@ from __future__ import print_function # import all class inside framework into fluid module -import framework -from framework import * +from . import framework +from .framework import * # import all class inside executor into fluid module -import executor -from executor import * - -import trainer -from trainer import Trainer -from trainer import BeginEpochEvent -from trainer import EndEpochEvent -from trainer import BeginStepEvent -from trainer import EndStepEvent -from trainer import CheckpointConfig - -import inferencer -from inferencer import Inferencer - -import io -import evaluator -import initializer -import layers -import nets -import optimizer -import backward -import regularizer -import average -import metrics -import transpiler -from param_attr import ParamAttr, WeightNormParamAttr -from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope -from transpiler import DistributeTranspiler, InferenceTranspiler, \ - memory_optimize, release_memory -from concurrency import (Go, make_channel, channel_send, channel_recv, - channel_close, Select) -from lod_tensor import create_lod_tensor, create_random_int_lodtensor -import clip -import profiler -import unique_name -import recordio_writer -import parallel_executor -from parallel_executor import * +from . import executor +from .executor import * + +from . import trainer +from .trainer import Trainer +from .trainer import BeginEpochEvent +from .trainer import EndEpochEvent +from .trainer import BeginStepEvent +from .trainer import EndStepEvent +from .trainer import CheckpointConfig + +from . import inferencer +from .inferencer import Inferencer + +from . import io +from . import evaluator +from . import initializer +from . import layers +from . import contrib +from . import nets +from . import optimizer +from . import backward +from . import regularizer +from . import average +from . import metrics +from . import transpiler +from .param_attr import ParamAttr, WeightNormParamAttr +from .data_feeder import DataFeeder +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .transpiler import DistributeTranspiler, InferenceTranspiler, \ + memory_optimize, release_memory, DistributeTranspilerConfig +from .lod_tensor import create_lod_tensor, create_random_int_lodtensor +from . import clip +from . import profiler +from . import unique_name +from . import recordio_writer +from . 
import parallel_executor +from .parallel_executor import * +from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor -__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ - trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ - 'io', - 'initializer', - 'layers', - 'transpiler' - 'nets', - 'optimizer', - 'learning_rate_decay', - 'backward', - 'regularizer', - 'LoDTensor', - 'CPUPlace', - 'CUDAPlace', - 'CUDAPinnedPlace', - 'Tensor', - 'ParamAttr', - 'WeightNormParamAttr', - 'DataFeeder', - 'clip', - 'profiler', - 'unique_name', - 'recordio_writer', - 'Scope', - ] +__all__ = framework.__all__ + executor.__all__ + \ + trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ + parallel_executor.__all__ + lod_tensor.__all__ + [ + 'io', + 'initializer', + 'layers', + 'contrib', + 'transpiler', + 'nets', + 'optimizer', + 'learning_rate_decay', + 'backward', + 'regularizer', + 'LoDTensor', + 'LoDTensorArray', + 'CPUPlace', + 'CUDAPlace', + 'CUDAPinnedPlace', + 'Tensor', + 'ParamAttr', + 'WeightNormParamAttr', + 'DataFeeder', + 'clip', + 'profiler', + 'unique_name', + 'recordio_writer', + 'Scope', + ] def __bootstrap__(): @@ -95,8 +97,8 @@ def __bootstrap__(): None """ import sys - import core import os + from . import core in_test = 'unittest' in sys.modules @@ -117,20 +119,16 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', - 'check_nan_inf', - 'benchmark', - 'warpctc_dir', - 'eager_delete_scope', - 'use_mkldnn', - 'initial_cpu_memory_in_mb', - 'init_allocated_mem', - 'free_idle_memory', - 'paddle_num_threads', - "dist_threadpool_size", - 'cpu_deterministic', - 'eager_delete_tensor_GB', + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', + 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', + "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_GB' ] + if core.is_compiled_with_dist(): + read_env_flags.append('rpc_deadline') + read_env_flags.append('rpc_server_profile_period') + read_env_flags.append('rpc_server_profile_path') + if core.is_compiled_with_cuda(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic' @@ -144,5 +142,5 @@ def __bootstrap__(): # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py. # Consider paddle.init(args) or paddle.main(args) -layers.monkey_patch_variable() +monkey_patch_variable() __bootstrap__() diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..15e7976354f2a22065f1723bfa696d056181dac2 --- /dev/null +++ b/python/paddle/fluid/annotations.py @@ -0,0 +1,39 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import functools +import sys + +__all__ = ['deprecated'] + + +def deprecated(since, instead, extra_message=""): + def decorator(func): + err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format( + func.__name__, since, instead) + if len(extra_message) != 0: + err_msg += "\n" + err_msg += extra_message + + @functools.wraps(func) + def wrapper(*args, **kwargs): + print(err_msg, file=sys.stderr) + return func(*args, **kwargs) + + wrapper.__doc__ += "\n " + wrapper.__doc__ += err_msg + return wrapper + + return decorator diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index 358e24df31bb517604481bb48b9180e579f8460d..42cd3b36420ef5a17a9a7d981978ba8869809936 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import warnings """ diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 4faa06303170488d0de2fda4c1461cfe2d623d35..a415cdbeaaae2a3bb4a137744205e3fe7366a78f 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from paddle.fluid import framework as framework from . import core import collections import copy -import unique_name +import six +from .. import compat as cpt +from . import unique_name -__all__ = [ - 'append_backward', - 'calc_gradient', -] +__all__ = ['append_backward'] def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): @@ -47,17 +48,25 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): """ op_desc = core.OpDesc() op_desc.set_type(op_type) - for para, args in inputs.iteritems(): - op_desc.set_input(para, args) - for para, args in outputs.iteritems(): - op_desc.set_output(para, args) + for para, args in six.iteritems(inputs): + op_desc.set_input( + para, + list( + map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg, + args))) + for para, args in six.iteritems(outputs): + op_desc.set_output( + para, + list( + map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg, + args))) op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() if op_role_attr_name not in attrs: attrs[ op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward - for name, val in attrs.iteritems(): + for name, val in six.iteritems(attrs): if isinstance(val, framework.Block): op_desc.set_block_attr(name, val.desc) else: @@ -69,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block): """ Infer the data type of given grad variable """ - grad_var = block.desc.find_var(grad_var_name.encode("ascii")) - fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii")) - if block.desc.has_var_recursive(fwd_name): - fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii")) + grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name)) + fwd_name = _strip_grad_suffix_(grad_var_name) + if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)): + fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name)) grad_var.set_dtype(fwd_var.dtype()) else: grad_var.set_dtype(core.VarDesc.VarType.FP32) @@ -96,8 +105,10 @@ def _some_in_set_(cands, s): """ if len(cands) == 0: return False - for c in cands: - if c in s: + 
literal_set = cpt.to_text(s) + literal_cands = cpt.to_text(cands) + for c in literal_cands: + if c in literal_set: return True return False @@ -108,6 +119,7 @@ def _strip_grad_suffix_(name): e.g. x@GRAD ==> x y@GRAD@RENAME@1 ==> y """ + name = cpt.to_text(name) pos = name.find(core.grad_var_suffix()) return name[:pos] if pos != -1 else name @@ -117,13 +129,14 @@ def _append_grad_suffix_(name): Append grad suffix to the given variable name e.g. x ==> x@GRAD """ - return name + core.grad_var_suffix() + return cpt.to_text(name) + core.grad_var_suffix() def _addup_repetitive_outputs_(op_descs): """ In backward part, an variable may be the output of more than one ops. - In this case, the variable should be the accumulation of all the outputs. + And one op may yield its multiple outputs to the same variable. + In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. """ pending_sum_ops = [] @@ -136,30 +149,47 @@ def _addup_repetitive_outputs_(op_descs): "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, {"use_mkldnn": False}), idx)) renamed_vars[var_name] = [var_name] - for var_name in op_desc.output_arg_names(): - if var_name == core.empty_var_name( - ) or var_name in op_desc.input_arg_names(): - # empty variable or inplace op - continue - if len(renamed_vars[var_name]) == 0: - # it's the first time we get the variable - renamed_vars[var_name] = [var_name] - else: - if len(renamed_vars[var_name]) == 1: + for param_idx, param_name in enumerate(op_desc.output_names()): + arg_names = op_desc.output(param_name) + for arg_idx, var_name in enumerate(arg_names): + if var_name == core.empty_var_name( + ) or var_name in op_desc.input_arg_names(): + # empty variable or inplace op + continue + if len(renamed_vars[var_name]) == 0: + # it's the first time we get the variable + renamed_vars[var_name] = [var_name] + else: + if len(renamed_vars[var_name]) == 1: + new_name = var_name + "@RENAME@" + \ + str(var_rename_count[var_name]) + var_rename_count[var_name] += 1 + # rename original var_name + renamed_vars[var_name][0] = new_name + _rename_arg_(op_descs, var_name, new_name, 0, idx) + _rename_arg_(pending_sum_ops, var_name, new_name) + + for p in op_desc.output_names()[:param_idx]: + p_arg_names = op_desc.output(p) + if var_name in p_arg_names: + op_desc.set_output(p, [ + new_name if x == var_name else x + for x in p_arg_names + ]) + + arg_names = [ + new_name if x == var_name else x + for x in arg_names[:arg_idx] + ] + arg_names[arg_idx:] + new_name = var_name + "@RENAME@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] += 1 - # rename original var_name - renamed_vars[var_name][0] = new_name - _rename_arg_(op_descs, var_name, new_name, 0, idx) - _rename_arg_(pending_sum_ops, var_name, new_name) - - new_name = var_name + "@RENAME@" + \ - str(var_rename_count[var_name]) - var_rename_count[var_name] += 1 - op_desc.rename_output(var_name, new_name) - renamed_vars[var_name].append(new_name) - for var_name, inputs in renamed_vars.iteritems(): + arg_names[arg_idx] = new_name + op_desc.set_output(param_name, arg_names) + renamed_vars[var_name].append(new_name) + + for var_name, inputs in six.iteritems(renamed_vars): if len(inputs) > 1: pending_sum_ops.append( (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, @@ -183,16 +213,19 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): out_arg_names = op_desc.output_arg_names() if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set): return True - if 
_all_in_set_( - filter(lambda name: name.find(core.grad_var_suffix()) != -1, - op_desc.input_arg_names()), no_grad_set): + if _all_in_set_([ + name for name in op_desc.input_arg_names() + if name.find(core.grad_var_suffix()) != -1 + ], no_grad_set): no_grad_set.update(out_arg_names) return True return False # Remove ops whose outputs are all in no_grad_dict - op_descs = filter( - lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs) + op_descs = [ + op_desc for op_desc in op_descs + if not _op_can_be_removed_(op_desc, no_grad_set) + ] # Insert fill_zeros_like_op to_insert = [] for idx, op_desc in enumerate(op_descs): @@ -202,17 +235,17 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): "X": [_strip_grad_suffix_(arg)] }, {"Out": [arg]}, {}), idx)) - map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) + list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) return op_descs -import proto.framework_pb2 as framework_pb2 +from .proto import framework_pb2 def serialize_op_decs(op_desc): protostr = op_desc.serialize_to_string() - proto = framework_pb2.OpDesc.FromString(str(protostr)) + proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr)) return proto.__str__() @@ -229,8 +262,10 @@ def _callback_lookup_(op): if op.type == 'parallel_do' and op.attr('use_nccl'): all_vars = op.block.vars param_names = set(op.input('parameters')) - param_names = filter(lambda name: all_vars[name].stop_gradient is False, - param_names) + param_names = [ + name for name in param_names + if all_vars[name].stop_gradient is False + ] param_grad_names = [n + "@GRAD" for n in param_names] class ParallelDoCallBack(object): @@ -311,9 +346,9 @@ def _append_backward_ops_(block, grad_sub_block_list = [] # If the op has its own sub-block, deal with the sub-block first if op.has_attr("sub_block"): - sub_block = program.block(op.block_attr("sub_block")) + sub_block = program.block(op.block_attr_id("sub_block")) grad_sub_block = program.create_block() - grad_sub_block.set_forward_block_idx(sub_block.idx) + grad_sub_block._set_forward_block_idx(sub_block.idx) cb = _callback_lookup_(op) if cb is not None: if callbacks is None: @@ -331,7 +366,7 @@ def _append_backward_ops_(block, # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( - op.desc, no_grad_dict[block.idx], grad_sub_block_list) + op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) grad_op_descs.extend(grad_op_desc) grad_to_var.update(op_grad_to_var) @@ -373,18 +408,17 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) if op_desc.has_attr("sub_block"): - sub_block = block.program.block(op_desc.block_attr("sub_block")) + sub_block = block.program.block(op_desc.block_attr_id("sub_block")) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) new_vars = set() # create new gradient variables for grad_var_name in op_desc.output_arg_names(): - grad_var_name = grad_var_name.encode("ascii") - if block.desc.has_var_recursive( - grad_var_name) or grad_var_name == core.empty_var_name(): + if block.desc.has_var_recursive(cpt.to_bytes( + grad_var_name)) or grad_var_name == core.empty_var_name(): continue - block.desc.var(grad_var_name) + block.desc.var(cpt.to_bytes(grad_var_name)) new_vars.add(grad_var_name) - if not grad_to_var.has_key(grad_var_name): + if grad_var_name not in grad_to_var: continue grad_info_map[grad_to_var[grad_var_name]] = 
(grad_var_name, block) # infer_shape and infer_type @@ -412,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): op_desc.rename_output(name, new_name) var_map[name] = new_name - for g, ng in var_map.iteritems(): + for g, ng in six.iteritems(var_map): if g in grad_to_var: grad_to_var[ng] = grad_to_var[g] grad_to_var.pop(g) @@ -424,7 +458,7 @@ def _get_stop_gradients_(program): for block in program.blocks: assert isinstance(block, framework.Block) block_no_grad_set = set() - for var in block.vars.itervalues(): + for var in list(block.vars.values()): assert isinstance(var, framework.Variable) if var.stop_gradient: block_no_grad_set.add(_append_grad_suffix_(var.name)) @@ -437,51 +471,51 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, """ Append backward part to main_program. - A complete neural network training is made up of forward and backward - propagation. However, when we configure a network, we only need to - specify its forwrd part. The backward part is generated automatically + A complete neural network training is made up of forward and backward + propagation. However, when we configure a network, we only need to + specify its forwrd part. The backward part is generated automatically according to the forward part by this function. - In most cases, users do not need to invoke this function manually. It + In most cases, users do not need to invoke this function manually. It will be automatically invoked by the optimizer's `minimize` function. Args: loss(Variable): The loss variable of the network. - parameter_list(list[string]|None): Names of parameters that need - to be updated by optimizers. - If it is None, all parameters + parameter_list(list[string]|None): Names of parameters that need + to be updated by optimizers. + If it is None, all parameters will be updated. Default: None - no_grad_set(set|None): Variables in the Block 0 whose gradients - should be ignored. All variables with - `step_gradient=True` from all blocks will + no_grad_set(set|None): Variables in the Block 0 whose gradients + should be ignored. All variables with + `step_gradient=True` from all blocks will be automatically added into this set. Default: None - callbacks(list[callable object]|None): The callbacks are used for - doing some custom jobs during - backward part building. All - callable objects in it will - be invoked once each time a - new gradient operator is added - into the program. The callable - object must has two input - parameters: 'block' and 'context'. - The 'block' is the block which - the new gradient operator will - be added to. The 'context' is a - map, whose keys are gradient - variable names and values are + callbacks(list[callable object]|None): The callbacks are used for + doing some custom jobs during + backward part building. All + callable objects in it will + be invoked once each time a + new gradient operator is added + into the program. The callable + object must has two input + parameters: 'block' and 'context'. + The 'block' is the block which + the new gradient operator will + be added to. The 'context' is a + map, whose keys are gradient + variable names and values are corresponding original variables. - In addition to this, the 'context' - has another special key-value pair: - the key is string '__current_op_desc__' - and the value is the op_desc of the - gradient operator who has just - triggered the callable object. 
+ In addition to this, the 'context' + has another special key-value pair: + the key is string '__current_op_desc__' + and the value is the op_desc of the + gradient operator who has just + triggered the callable object. Returns: - list[(Variable,Variable)]: Pairs of parameter and its - corresponding gradients. The key is the parameter and the + list[(Variable,Variable)]: Pairs of parameter and its + corresponding gradients. The key is the parameter and the value is gradient variable. Raises: @@ -520,7 +554,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, no_grad_set = set() no_grad_set = copy.copy(no_grad_set) no_grad_dict = _get_stop_gradients_(program) - no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set))) grad_info_map = dict() root_block = program.block(0) @@ -543,7 +577,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) - no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, grad_to_var, callbacks) @@ -556,19 +590,18 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) program.current_block_idx = current_block_idx - program.sync_with_cpp() - # FIXME(zcd): prevent loss.grad optimized by mem_opt. - loss.block.var(_append_grad_suffix_(loss.name)).persistable = True + program._sync_with_cpp() if parameter_list is not None: parameters = parameter_list else: params = program.global_block().all_parameters() + program.global_block().iter_parameters() parameters = [param.name for param in params] params_and_grads = [] for param in parameters: - if param not in grad_info_map: + if cpt.to_text(param) not in grad_info_map: continue grad_info = grad_info_map[param] grad_block = grad_info[1] @@ -684,7 +717,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): no_grad_set = set() no_grad_set = copy.copy(no_grad_set) no_grad_dict = _get_stop_gradients_(prog) - no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set))) fwd_op_num = block.desc.op_size() @@ -718,7 +751,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) op_path = _find_op_path_(block, targets, inputs, block_no_grad_set) - no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var) @@ -729,7 +762,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map) _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) - prog.sync_with_cpp() + prog._sync_with_cpp() grad_vars = [] for input_var in inputs: diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 18e2f3045e272fb4712391f87bffd3f367c1c744..ba7ba3b5e983bfbaa82fc752f4821e8a934dfb8c 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -12,11 +12,14 @@ # 
See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import copy +import six import functools -import layers -import framework +from . import layers +from . import framework from . import core __all__ = [ @@ -31,7 +34,7 @@ class BaseErrorClipAttr(object): def __str__(self): raise NotImplementedError() - def append_clip_op(self, block, grad_name): + def _append_clip_op(self, block, grad_name): raise NotImplementedError() @@ -67,7 +70,7 @@ class ErrorClipByValue(BaseErrorClipAttr): def __str__(self): return "ByValue, min=%f, max=%f" % (self.min, self.max) - def append_clip_op(self, block, grad_name): + def _append_clip_op(self, block, grad_name): clip_op_desc = block.desc.append_op() clip_op_desc.set_type("clip") clip_op_desc.set_input("X", [grad_name]) @@ -80,9 +83,8 @@ def error_clip_callback(block, context): # the context is a grad_to_var map grad_to_var = context op_desc = block.desc.op(block.desc.op_size() - 1) - for grad_n in filter(lambda n: grad_to_var.has_key(n), - op_desc.output_arg_names()): - fwd_var = block.var_recursive(grad_to_var[grad_n]) + for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: + fwd_var = block._var_recursive(grad_to_var[grad_n]) error_clip = getattr(fwd_var, "error_clip", None) if not (error_clip is None or isinstance(error_clip, BaseErrorClipAttr)): @@ -90,17 +92,17 @@ def error_clip_callback(block, context): "Variable's error_clip should be an instance of BaseErrorClipAttr or None." ) if error_clip is not None: - error_clip.append_clip_op(block, grad_n) + error_clip._append_clip_op(block, grad_n) class BaseGradientClipAttr(object): def __str__(self): raise NotImplementedError() - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): raise NotImplementedError() - def create_operators(self, param, grad): + def _create_operators(self, param, grad): raise NotImplementedError() @@ -108,10 +110,10 @@ class NullGradientClipAttr(BaseGradientClipAttr): def __str__(self): return "Null" - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): pass - def create_operators(self, param, grad): + def _create_operators(self, param, grad): return param, grad @@ -153,10 +155,10 @@ class GradientClipByValue(BaseGradientClipAttr): def __str__(self): return "ByValue, min=%f, max=%f" % (self.min, self.max) - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): pass - def create_operators(self, param, grad): + def _create_operators(self, param, grad): new_grad = layers.clip(x=grad, min=self.min, max=self.max) return param, new_grad @@ -199,10 +201,10 @@ class GradientClipByNorm(BaseGradientClipAttr): def __str__(self): return "ByNorm, clip_norm=%f" % self.clip_norm - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): pass - def create_operators(self, param, grad): + def _create_operators(self, param, grad): new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm) return param, new_grad @@ -247,8 +249,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): """ def __init__(self, clip_norm, group_name="default_group"): - if not isinstance(group_name, basestring): - raise TypeError("'group_name' must be a basestring.") + if not isinstance(group_name, six.string_types): + raise TypeError("'group_name' must be a %s." 
% (six.string_types)) self.clip_norm = clip_norm self.group_name = group_name @@ -257,7 +259,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name, self.clip_norm) - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): if self.group_name not in context: context[self.group_name] = [] context[self.group_name + "_clip_value"] = self.clip_norm @@ -274,7 +276,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): self.context = context - def create_operators(self, param, grad): + def _create_operators(self, param, grad): group_scale_name = self.group_name + "_scale" if group_scale_name not in self.context: group_norm_var = layers.sums(input=self.context[self.group_name]) @@ -284,7 +286,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): x=clip_var, y=layers.elementwise_max( x=clip_var, y=group_norm_var)) - assert group_scale_var.shape == (1L, ) + assert group_scale_var.shape == (1, ) self.context[group_scale_name] = group_scale_var new_grad = layers.elementwise_mul( @@ -313,7 +315,7 @@ def set_gradient_clip(clip, param_list=None, program=None): program = framework.default_main_program() if param_list is None: param_list = program.block(0).all_parameters() - if all(isinstance(elem, basestring) for elem in param_list): + if all(isinstance(elem, six.string_types) for elem in param_list): param_list = [program.block(0).var(elem) for elem in param_list] if not all(isinstance(elem, framework.Parameter) for elem in param_list): raise TypeError( @@ -324,10 +326,12 @@ def set_gradient_clip(clip, param_list=None, program=None): param.gradient_clip_attr = copy.deepcopy(clip) -def append_gradient_clip_ops(param_grad): +def append_gradient_clip_ops(param_grads): context = dict() - for p, g in param_grad: - with p.block.program.optimized_guard(p): + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) if clip_attr is None: clip_attr = NullGradientClipAttr() @@ -336,12 +340,14 @@ def append_gradient_clip_ops(param_grad): "clip attribute should be an instance of BaseGradientClipAttr" ) - clip_attr.process_context(context=context, param=p, grad=g) + clip_attr._process_context(context=context, param=p, grad=g) res = [] - for p, g in param_grad: - with p.block.program.optimized_guard(p): - res.append(clip_attr.create_operators(param=p, grad=g)) + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): + res.append(clip_attr._create_operators(param=p, grad=g)) return res diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index 470dd0df524936a773f6e740c8079f0efa8ef7b4..b4a06f23a6f2713b665bdd42919925e4a0475a82 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from layers.control_flow import BlockGuard, equal +from __future__ import print_function + +from .layers.control_flow import BlockGuard, equal from .framework import Operator -from layer_helper import LayerHelper, unique_name -from layers import fill_constant -import core +from .layer_helper import LayerHelper, unique_name +from .layers import fill_constant +from . 
import core __all__ = [ - 'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', - 'Select' + 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select' ] @@ -35,10 +36,10 @@ class Go(BlockGuard): def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: return False - self.construct_go_op() + self._construct_go_op() return super(Go, self).__exit__(exc_type, exc_val, exc_tb) - def construct_go_op(self): + def _construct_go_op(self): main_program = self.helper.main_program go_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() @@ -69,8 +70,10 @@ class Go(BlockGuard): parent_block.append_op( type='go', inputs={ - 'X': - [parent_block.var_recursive(x_name) for x_name in x_name_list] + 'X': [ + parent_block._var_recursive(x_name) + for x_name in x_name_list + ] }, outputs={}, attrs={'sub_block': go_block}) @@ -259,7 +262,7 @@ class Select(BlockGuard): if var_name in intermediate ] - X = [select_block.var_recursive(x_name) for x_name in params] + X = [select_block._var_recursive(x_name) for x_name in params] # Needs to be used by `equal` inside the cases block. X.append(self.case_to_execute) diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5607f11932bbe6aff548be316dc39b4636e079f4 --- /dev/null +++ b/python/paddle/fluid/contrib/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import decoder +from .decoder import * +from . import memory_usage_calc +from .memory_usage_calc import * + +__all__ = decoder.__all__ + memory_usage_calc.__all__ diff --git a/paddle/contrib/CMakeLists.txt b/python/paddle/fluid/contrib/decoder/__init__.py similarity index 65% rename from paddle/contrib/CMakeLists.txt rename to python/paddle/fluid/contrib/decoder/__init__.py index 4b19256ef4533a09162edf907f6cd51146517e46..9f973fd3c9af60a0c9a2ba5225a616671545436b 100644 --- a/paddle/contrib/CMakeLists.txt +++ b/python/paddle/fluid/contrib/decoder/__init__.py @@ -1,16 +1,20 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -add_subdirectory(inference) +from __future__ import print_function + +from . 
import beam_search_decoder +from .beam_search_decoder import * + +__all__ = beam_search_decoder.__all__ diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f2b7ac8375af25beed562b8279b6044f11c09d44 --- /dev/null +++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py @@ -0,0 +1,842 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module provides a general beam search decoder API for RNN based decoders. +The purpose of this API is to allow users to highly customize the behavior +within their RNN decoder(vanilla RNN, LSTM, attention + LSTM, future etc.), +without using the low level API such as while ops. + +This API is still under active development and may change drastically. +""" + +from __future__ import print_function + +import contextlib +import numpy as np +import six + +from ... import layers +from ...framework import Variable +from ... import core +from ... import framework, unique_name +from ...layer_helper import LayerHelper + +__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder'] + + +class _DecoderType: + TRAINING = 1 + BEAM_SEARCH = 2 + + +class InitState(object): + """ + The initial hidden state object. The state objects holds a variable, and may + use it to initialize the hidden state cell of RNN. Usually used as input to + `StateCell` class. + + Args: + init (Variable): The initial variable of the hidden state. If set None, + the variable will be created as a tensor with constant value based + on `shape` and `value` param. + shape (tuple|list): If `init` is None, new Variable's shape. Default + None. + value (float): If `init` is None, new Variable's value. Default None. + init_boot (Variable): If provided, the initial variable will be created + with the same shape as this variable. + need_reorder (bool): If set true, the init will be sorted by its lod + rank within its batches. This should be used if `batch_size > 1`. + dtype (np.dtype|core.VarDesc.VarType|str): Data type of the initial + variable. + + Returns: + An initialized state object. + + Examples: + See `StateCell`. 
+ """ + + def __init__(self, + init=None, + shape=None, + value=0.0, + init_boot=None, + need_reorder=False, + dtype='float32'): + if init is not None: + self._init = init + elif init_boot is None: + raise ValueError( + 'init_boot must be provided to infer the shape of InitState .\n') + else: + self._init = layers.fill_constant_batch_size_like( + input=init_boot, value=value, shape=shape, dtype=dtype) + + self._shape = shape + self._value = value + self._need_reorder = need_reorder + self._dtype = dtype + + @property + def value(self): + return self._init + + @property + def need_reorder(self): + return self._need_reorder + + +class _MemoryState(object): + def __init__(self, state_name, rnn_obj, init_state): + self._state_name = state_name # each is a rnn.memory + self._rnn_obj = rnn_obj + self._state_mem = self._rnn_obj.memory( + init=init_state.value, need_reorder=init_state.need_reorder) + + def get_state(self): + return self._state_mem + + def update_state(self, state): + self._rnn_obj.update_memory(self._state_mem, state) + + +class _ArrayState(object): + def __init__(self, state_name, block, init_state): + self._state_name = state_name + self._block = block + + self._state_array = self._block.create_var( + name=unique_name.generate('array_state_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init_state.value.dtype) + + self._counter = self._block.create_var( + name=unique_name.generate('array_state_counter'), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='int64') + + # initialize counter + self._block.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [self._counter]}, + attrs={ + 'shape': [1], + 'dtype': self._counter.dtype, + 'value': float(0.0), + 'force_cpu': True + }) + + self._counter.stop_gradient = True + + # write initial state + block.append_op( + type='write_to_array', + inputs={'X': init_state.value, + 'I': self._counter}, + outputs={'Out': self._state_array}) + + def get_state(self): + state = layers.array_read(array=self._state_array, i=self._counter) + return state + + def update_state(self, state): + layers.increment(x=self._counter, value=1, in_place=True) + layers.array_write(state, array=self._state_array, i=self._counter) + + +class StateCell(object): + """ + The state cell class stores the hidden state of the RNN cell. A typical RNN + cell has one or more hidden states, and one or more step inputs. This class + allows you to defines the name of hidden states as well as step inputs, and + their associated variables. + + Args: + inputs (dict): A feeding dict of {name(str) : Variable}. It specifies + the names of step inputs for RNN cell, and the associated variables. + The variable could initially be None and set manually during each + RNN step. + states (dict): A feeding dict of {name(str) : InitState object}. It + specifies the names of hidden states and their initialized state. + out_state (str): A string that specifies the name of hidden state that + will be used to compute the score in beam search process. + name (str): The name of the RNN cell. Default None. + + Raises: + `ValueError`: If the initial state is not an instance of InitState, or + the out_state is not in the dict of states. + + Returns: + StateCell: The initialized StateCell object. + + Examples: + .. 
code-block:: python + hidden_state = InitState(init=encoder_out, need_reorder=True) + state_cell = StateCell( + inputs={'current_word': None}, + states={'h': hidden_state}, + out_state='h') + """ + + def __init__(self, inputs, states, out_state, name=None): + self._helper = LayerHelper('state_cell', name=name) + self._cur_states = {} + self._state_names = [] + for state_name, state in six.iteritems(states): + if not isinstance(state, InitState): + raise ValueError('state must be an InitState object.') + self._cur_states[state_name] = state + self._state_names.append(state_name) + self._inputs = inputs # inputs is place holder here + self._cur_decoder_obj = None + self._in_decoder = False + self._states_holder = {} + self._switched_decoder = False + self._state_updater = None + self._out_state = out_state + if self._out_state not in self._cur_states: + raise ValueError('out_state must be one state in states') + + def _enter_decoder(self, decoder_obj): + if self._in_decoder == True or self._cur_decoder_obj is not None: + raise ValueError('StateCell has already entered a decoder.') + self._in_decoder = True + self._cur_decoder_obj = decoder_obj + self._switched_decoder = False + + def _leave_decoder(self, decoder_obj): + if not self._in_decoder: + raise ValueError('StateCell not in decoder, ' + 'invalid leaving operation.') + + if self._cur_decoder_obj != decoder_obj: + raise ValueError('Inconsistent decoder object in StateCell.') + + self._in_decoder = False + self._cur_decoder_obj = None + self._switched_decoder = False + + def _switch_decoder(self): # lazy switch + if not self._in_decoder: + raise ValueError('StateCell must be enter a decoder.') + + if self._switched_decoder: + raise ValueError('StateCell already done switching.') + + for state_name in self._state_names: + if state_name not in self._states_holder: + state = self._cur_states[state_name] + + if not isinstance(state, InitState): + raise ValueError('Current type of state is %s, should be ' + 'an InitState object.' % type(state)) + + self._states_holder[state_name] = {} + + if self._cur_decoder_obj.type == _DecoderType.TRAINING: + self._states_holder[state_name][id(self._cur_decoder_obj)] \ + = _MemoryState(state_name, + self._cur_decoder_obj.dynamic_rnn, + state) + elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH: + self._states_holder[state_name][id(self._cur_decoder_obj)] \ + = _ArrayState(state_name, + self._cur_decoder_obj._parent_block(), + state) + else: + raise ValueError('Unknown decoder type, only support ' + '[TRAINING, BEAM_SEARCH]') + + # Read back, since current state should be LoDTensor + self._cur_states[state_name] = \ + self._states_holder[state_name][ + id(self._cur_decoder_obj)].get_state() + + self._switched_decoder = True + + def get_state(self, state_name): + """ + The getter of state object. Find the state variable by its name. + + Args: + state_name (str): A string of the state's name. + + Returns: + The associated state object. + """ + if self._in_decoder and not self._switched_decoder: + self._switch_decoder() + + if state_name not in self._cur_states: + raise ValueError( + 'Unknown state %s. Please make sure _switch_decoder() ' + 'invoked.' % state_name) + + return self._cur_states[state_name] + + def get_input(self, input_name): + """ + The getter of input variable. Find the input variable by its name. + + Args: + input_name (str): The string of the input's name. + + Returns: + The associated input variable. 
+ """ + if input_name not in self._inputs or self._inputs[input_name] is None: + raise ValueError('Invalid input %s.' % input_name) + return self._inputs[input_name] + + def set_state(self, state_name, state_value): + """ + The setter of the state variable. Change the variable of the given + `state_name`. + + Args: + state_name (str): The name of the state to change. + state_value (Var): The variable of the new state. + """ + self._cur_states[state_name] = state_value + + def state_updater(self, updater): + """ + Set up the updater to update the hidden state every RNN step. The + behavior of updater could be customized by users. The updater should be + a function that takes a `StateCell` object as input and update the + hidden state within it. The hidden state could be accessed through + `get_state` method. + + Args: + updater (func): the updater to update the state cell. + """ + self._state_updater = updater + + def _decorator(state_cell): + if state_cell == self: + raise TypeError('Updater should only accept a StateCell object ' + 'as argument.') + updater(state_cell) + + return _decorator + + def compute_state(self, inputs): + """ + Provide the step input of RNN cell, and compute the new hidden state + with updater and give step input. + + Args: + inputs (dict): A feed dict, {name(str): Variable}. name should be + the names of step inputs for this RNN cell, and Variable should be + the associated variables. + + Examples: + .. code-block:: python + state_cell.compute_state(inputs={'x': current_word}) + """ + if self._in_decoder and not self._switched_decoder: + self._switch_decoder() + + for input_name, input_value in six.iteritems(inputs): + if input_name not in self._inputs: + raise ValueError('Unknown input %s. ' + 'Please make sure %s in input ' + 'place holder.' % (input_name, input_name)) + self._inputs[input_name] = input_value + self._state_updater(self) + + def update_states(self): + """ + Update and record state information after each RNN step. + """ + if self._in_decoder and not self._switched_decoder: + self._switched_decoder() + + for state_name, decoder_state in six.iteritems(self._states_holder): + if id(self._cur_decoder_obj) not in decoder_state: + raise ValueError('Unknown decoder object, please make sure ' + 'switch_decoder been invoked.') + decoder_state[id(self._cur_decoder_obj)].update_state( + self._cur_states[state_name]) + + def out_state(self): + """ + Get the output state variable. This must be called after update_states. + + Returns: + The output variable of the RNN cell. + """ + return self._cur_states[self._out_state] + + +class TrainingDecoder(object): + """ + A decoder that can only be used for training. The decoder could be + initialized with a `StateCell` object. The computation within the RNN cell + could be defined with decoder's block. + + Args: + state_cell (StateCell): A StateCell object that handles the input and + state variables. + name (str): The name of this decoder. Default None. + + Returns: + TrainingDecoder: The initialized TrainingDecoder object. + + Examples: + .. 
code-block:: python + decoder = TrainingDecoder(state_cell) + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = layers.fc(input=decoder.state_cell.get_state('h'), + size=32, + act='softmax') + decoder.state_cell.update_states() + decoder.output(current_score) + """ + BEFORE_DECODER = 0 + IN_DECODER = 1 + AFTER_DECODER = 2 + + def __init__(self, state_cell, name=None): + self._helper = LayerHelper('training_decoder', name=name) + self._status = TrainingDecoder.BEFORE_DECODER + self._dynamic_rnn = layers.DynamicRNN() + self._type = _DecoderType.TRAINING + self._state_cell = state_cell + self._state_cell._enter_decoder(self) + + @contextlib.contextmanager + def block(self): + """ + Define the behavior of the decoder for each RNN time step. + """ + if self._status != TrainingDecoder.BEFORE_DECODER: + raise ValueError('decoder.block() can only be invoked once') + self._status = TrainingDecoder.IN_DECODER + with self._dynamic_rnn.block(): + yield + self._status = TrainingDecoder.AFTER_DECODER + self._state_cell._leave_decoder(self) + + @property + def state_cell(self): + self._assert_in_decoder_block('state_cell') + return self._state_cell + + @property + def dynamic_rnn(self): + return self._dynamic_rnn + + @property + def type(self): + return self._type + + def step_input(self, x): + """ + Set the input variable as a step input to the RNN cell. For example, + in machine translation, each time step we read one word from the target + sentences, then the target sentence is a step input to the RNN cell. + + Args: + x (Variable): the variable to be used as step input. + + Returns: + Variable: The variable as input of current step. + + Examples: + .. code-block:: python + current_word = decoder.step_input(trg_embedding) + """ + self._assert_in_decoder_block('step_input') + return self._dynamic_rnn.step_input(x) + + def static_input(self, x): + """ + Set the input variable as a static input of RNN cell. In contrast to + step input, this variable will be used as a whole within the RNN decode + loop and will not be scattered into time steps. + + Args: + x (Variable): the variable to be used as static input. + + Returns: + Variable: The variable as input of current step. + + Examples: + .. code-block:: python + encoder_vec = decoder.static_input(encoded_vector) + """ + self._assert_in_decoder_block('static_input') + return self._dynamic_rnn.static_input(x) + + def __call__(self, *args, **kwargs): + """ + Get the output of RNN. This API should only be invoked after RNN.block() + + Returns: + Variable: The specified output of the RNN cell. + """ + if self._status != TrainingDecoder.AFTER_DECODER: + raise ValueError('Output of training decoder can only be visited ' + 'outside the block.') + return self._dynamic_rnn(*args, **kwargs) + + def output(self, *outputs): + """ + Set the output variable of the RNN cell. + + Args: + *outputs (Variables): a series of variables that treated as output + of the RNN cell. + + Examples: + .. code-block:: python + out = fluid.layers.fc(input=h, + size=32, + bias_attr=True, + act='softmax') + decoder.output(out) + """ + self._assert_in_decoder_block('output') + self._dynamic_rnn.output(*outputs) + + def _assert_in_decoder_block(self, method): + if self._status != TrainingDecoder.IN_DECODER: + raise ValueError('%s should be invoked inside block of ' + 'TrainingDecoder object.' 
% method) + + +class BeamSearchDecoder(object): + """ + A beam search decoder that can be used for inference. The decoder should be + initialized with a `StateCell` object. The decode process can be defined + within its block. + + Args: + state_cell (StateCell): A StateCell object that handles the input and + state variables. + init_ids (Variable): The init beam search token ids. + init_scores (Variable): The associated score of each id. + target_dict_dim (int): Size of dictionary. + word_dim (int): Word embedding dimension. + input_var_dict (dict): A feeding dict to feed the required input + variables to the state cell. It will be used by state_cell 's + compute method. Default empty. + topk_size (int): The topk size used for beam search. Default 50. + max_len (int): The maximum allowed length of the generated sentence. + Default 100. + beam_size (int): The beam width of beam search decode. Default 1. + end_id (int): The id of end token within beam search. + name (str): The name of this decoder. Default None. + + Returns: + BeamSearchDecoder: A initialized BeamSearchDecoder object. + + Examples: + .. code-block:: python + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=word_dim, + init_var_dict={}, + topk_size=topk_size, + sparse_emb=IS_SPARSE, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None + ) + decoder.decode() + translation_ids, translation_scores = decoder() + """ + BEFORE_BEAM_SEARCH_DECODER = 0 + IN_BEAM_SEARCH_DECODER = 1 + AFTER_BEAM_SEARCH_DECODER = 2 + + def __init__(self, + state_cell, + init_ids, + init_scores, + target_dict_dim, + word_dim, + input_var_dict={}, + topk_size=50, + sparse_emb=True, + max_len=100, + beam_size=1, + end_id=1, + name=None): + self._helper = LayerHelper('beam_search_decoder', name=name) + self._counter = layers.zeros(shape=[1], dtype='int64') + self._counter.stop_gradient = True + self._type = _DecoderType.BEAM_SEARCH + self._max_len = layers.fill_constant( + shape=[1], dtype='int64', value=max_len) + self._cond = layers.less_than( + x=self._counter, + y=layers.fill_constant( + shape=[1], dtype='int64', value=max_len)) + self._while_op = layers.While(self._cond) + self._state_cell = state_cell + self._state_cell._enter_decoder(self) + self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER + self._zero_idx = layers.fill_constant( + shape=[1], value=0, dtype='int64', force_cpu=True) + self._array_dict = {} + self._array_link = [] + self._ids_array = None + self._scores_array = None + self._beam_size = beam_size + self._end_id = end_id + + self._init_ids = init_ids + self._init_scores = init_scores + self._target_dict_dim = target_dict_dim + self._topk_size = topk_size + self._sparse_emb = sparse_emb + self._word_dim = word_dim + self._input_var_dict = input_var_dict + + @contextlib.contextmanager + def block(self): + """ + Define the behavior of the decoder for each RNN time step. 
+ """ + if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER: + raise ValueError('block() can only be invoke once.') + + self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER + + with self._while_op.block(): + yield + with layers.Switch() as switch: + with switch.case(self._cond): + layers.increment(x=self._counter, value=1.0, in_place=True) + + for value, array in self._array_link: + layers.array_write( + x=value, i=self._counter, array=array) + + layers.less_than( + x=self._counter, y=self._max_len, cond=self._cond) + + self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER + self._state_cell._leave_decoder(self) + + @property + def type(self): + return self._type + + def early_stop(self): + """ + Stop the generation process in advance. Could be used as "break". + """ + layers.fill_constant( + shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond) + + def decode(self): + """ + Set up the computation within the decoder. Then you could call the + decoder to get the result of beam search decode. If you want to define + a more specific decoder, you could override this function. + + Examples: + .. code-block:: python + decoder.decode() + translation_ids, translation_scores = decoder() + """ + with self.block(): + prev_ids = self.read_array(init=self._init_ids, is_ids=True) + prev_scores = self.read_array( + init=self._init_scores, is_scores=True) + prev_ids_embedding = layers.embedding( + input=prev_ids, + size=[self._target_dict_dim, self._word_dim], + dtype='float32', + is_sparse=self._sparse_emb) + + feed_dict = {} + update_dict = {} + + for init_var_name, init_var in six.iteritems(self._input_var_dict): + if init_var_name not in self.state_cell._inputs: + raise ValueError('Variable ' + init_var_name + + ' not found in StateCell!\n') + + read_var = self.read_array(init=init_var) + update_dict[init_var_name] = read_var + feed_var_expanded = layers.sequence_expand(read_var, + prev_scores) + feed_dict[init_var_name] = feed_var_expanded + + for state_str in self._state_cell._state_names: + prev_state = self.state_cell.get_state(state_str) + prev_state_expanded = layers.sequence_expand(prev_state, + prev_scores) + self.state_cell.set_state(state_str, prev_state_expanded) + + for i, input_name in enumerate(self._state_cell._inputs): + if input_name not in feed_dict: + feed_dict[input_name] = prev_ids_embedding + + self.state_cell.compute_state(inputs=feed_dict) + current_state = self.state_cell.out_state() + current_state_with_lod = layers.lod_reset( + x=current_state, y=prev_scores) + scores = layers.fc(input=current_state_with_lod, + size=self._target_dict_dim, + act='softmax') + topk_scores, topk_indices = layers.topk(scores, k=self._topk_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores), + y=layers.reshape( + prev_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + prev_ids, + prev_scores, + topk_indices, + accu_scores, + self._beam_size, + end_id=1, + level=0) + + with layers.Switch() as switch: + with switch.case(layers.is_empty(selected_ids)): + self.early_stop() + with switch.default(): + self.state_cell.update_states() + self.update_array(prev_ids, selected_ids) + self.update_array(prev_scores, selected_scores) + for update_name, var_to_update in six.iteritems( + update_dict): + self.update_array(var_to_update, feed_dict[update_name]) + + def read_array(self, init, is_ids=False, is_scores=False): + """ + Read an array to get the decoded ids and scores generated by previous + RNN step. 
At the first step of RNN, the init variable mut be used to + initialize the array. + + Args: + init (Variable): The initial variable for first step usage. init + must be provided. + is_ids (bool): Specify whether the variable is an id. + is_scores (bool): Specify whether the variable is a score. + + Returns: + The associated variable generated during previous RNN steps. + + Examples: + .. code-block:: python + prev_ids = decoder.read_array(init=init_ids, is_ids=True) + prev_scores = decoder.read_array(init=init_scores, is_scores=True) + """ + self._assert_in_decoder_block('read_array') + + if is_ids and is_scores: + raise ValueError('Shouldn\'t mark current array be ids array and' + 'scores array at the same time.') + + if not isinstance(init, Variable): + raise TypeError('The input argument `init` must be a Variable.') + + parent_block = self._parent_block() + array = parent_block.create_var( + name=unique_name.generate('beam_search_decoder_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init.dtype) + parent_block.append_op( + type='write_to_array', + inputs={'X': init, + 'I': self._zero_idx}, + outputs={'Out': array}) + + if is_ids: + self._ids_array = array + elif is_scores: + self._scores_array = array + + read_value = layers.array_read(array=array, i=self._counter) + self._array_dict[read_value.name] = array + return read_value + + def update_array(self, array, value): + """ + Store the value generated in current step in an array for each RNN step. + This array could be accessed by read_array method. + + Args: + array (Variable): The array to append the new variable to. + value (Variable): The newly generated value to be stored. + """ + self._assert_in_decoder_block('update_array') + + if not isinstance(array, Variable): + raise TypeError( + 'The input argument `array` of must be a Variable.') + if not isinstance(value, Variable): + raise TypeError('The input argument `value` of must be a Variable.') + + array = self._array_dict.get(array.name, None) + if array is None: + raise ValueError('Please invoke read_array before update_array.') + self._array_link.append((value, array)) + + def __call__(self): + """ + Run the decode process and return the final decode result. + + Returns: + A tuple of decoded (id, score) pairs. id is a Variable that holds + the generated tokens, and score is a Variable with the same shape + as id, holds the score for each generated token. + """ + if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER: + raise ValueError('Output of BeamSearchDecoder object can ' + 'only be visited outside the block.') + return layers.beam_search_decode( + ids=self._ids_array, + scores=self._scores_array, + beam_size=self._beam_size, + end_id=self._end_id) + + @property + def state_cell(self): + self._assert_in_decoder_block('state_cell') + return self._state_cell + + def _parent_block(self): + """ + Getter of parent block. + + Returns: + The parent block of decoder. + """ + program = self._helper.main_program + parent_block_idx = program.current_block().parent_idx + if parent_block_idx < 0: + raise ValueError('Invalid block with index %d.' % parent_block_idx) + parent_block = program.block(parent_block_idx) + return parent_block + + def _assert_in_decoder_block(self, method): + if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER: + raise ValueError('%s should be invoked inside block of ' + 'BeamSearchDecoder object.' 
% method) diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py new file mode 100644 index 0000000000000000000000000000000000000000..baa14a573fcfdfa943af1e995f687c74e9fb4d07 --- /dev/null +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -0,0 +1,120 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module privides a memory usage calculate function for user. +The purpose of this API is to allow users to estimate memory usage of +a program under a special batch size, then user can set appropriate +batch size to fully utilize a GPU. + +This API is still under active development and may change drastically. +""" + +from __future__ import print_function + +import six + +from .. import core +from ..framework import Program, Variable + +__all__ = ['memory_usage'] + +dtype_to_size = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, +} + +DEBUG = False + + +def memory_usage(program, batch_size): + """ + Get the estimate memory usage of program with input batch size. + + Args: + program(Program): The current Program. + batch_size(int): The current input data batch_size. + + Returns: + min_total_memory(float): the estimate memory usage lower bound. + max_total_memory(float): the estimate memory usage upper bound. + unit_str(string): the unit of estimate usage result. + + Examples: + + >>> import paddle.fluid as fluid + >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage( + fluid.default_main_program(), batch_size=10) + >>> print "memory usage is about %.3f - %.3f %s" % \ + (lower_usage, upper_usage, unit) + + """ + + # Parameters check + if not isinstance(program, Program): + raise TypeError( + "Calculating Memory Usage requires Program as its Parameter." + "But you passed in %s" % (type(program))) + if batch_size <= 0: + raise ValueError("The batch size need to be positive.") + + # Get the var_name list of first block and calculate + total_memory = 0.0 + processed_var_names = set() + for op in program.global_block().ops: + for var_name in op.output_arg_names: + if var_name in processed_var_names: + continue + processed_var_names.add(var_name) + var = program.global_block().vars[var_name] + if var.desc.type() != core.VarDesc.VarType.LOD_TENSOR: + continue + + data_count = 1 + neg_dim_count = 0 + for x in var.shape: + if x < 0: + if neg_dim_count >= 1: + raise ValueError("Var %s has more than one negtive dim." 
+ % (var_name)) + neg_dim_count += 1 + data_count *= batch_size * (-x) + else: + data_count *= x + var_memory = data_count * dtype_to_size[var.dtype] + if DEBUG: + print("%s memory usage: %d" % (var.name, var_memory)) + total_memory += var_memory + if DEBUG: + print("total memory usage: %.2f" % (total_memory)) + + # Convert appropriate unit + unit_str = "B" + if total_memory > 1024: + total_memory /= 1024 + unit_str = "KB" + if total_memory > 1024: + total_memory /= 1024 + unit_str = "MB" + + # Append extra memory consumption (5% - 10%) + min_total_memory = total_memory * 1.05 + max_total_memory = total_memory * 1.1 + + return min_total_memory, max_total_memory, unit_str diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index c859778b3757f638ac531620f241e684522add57..631bbfe1fe59ddd9cd315fb64ca32e1e125b0e8d 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -13,13 +13,15 @@ # limitations under the License. from __future__ import print_function -import core + +from . import core import numpy import os -import six.moves as six +import six +from six.moves import zip, range, xrange import multiprocessing -from framework import Variable, default_main_program +from .framework import Variable, default_main_program __all__ = ['DataFeeder'] @@ -53,7 +55,7 @@ class DataToLoDTensorConverter(object): self.data = [] self.lod = [] - for i in six.range(lod_level): + for i in six.moves.range(lod_level): self.lod.append([]) def feed(self, data): @@ -142,7 +144,7 @@ class DataFeeder(object): if program is None: program = default_main_program() for each_var in feed_list: - if isinstance(each_var, basestring): + if isinstance(each_var, six.string_types): each_var = program.block(0).var(each_var) if not isinstance(each_var, Variable): raise TypeError("Feed list should contain a list of variable") @@ -174,7 +176,7 @@ class DataFeeder(object): dict: the result of conversion. 
""" converter = [] - for lod_level, shape, dtype in six.zip( + for lod_level, shape, dtype in six.moves.zip( self.feed_lod_level, self.feed_shapes, self.feed_dtypes): converter.append( DataToLoDTensorConverter( @@ -187,10 +189,12 @@ class DataFeeder(object): assert len(each_sample) == len(converter), ( "The number of fields in data (%s) does not match " + "len(feed_list) (%s)") % (len(each_sample), len(converter)) - for each_converter, each_slot in six.zip(converter, each_sample): + for each_converter, each_slot in six.moves.zip(converter, + each_sample): each_converter.feed(each_slot) ret_dict = {} - for each_name, each_converter in six.zip(self.feed_names, converter): + for each_name, each_converter in six.moves.zip(self.feed_names, + converter): ret_dict[each_name] = each_converter.done() return ret_dict @@ -212,12 +216,14 @@ class DataFeeder(object): if isinstance(self.place, core.CUDAPlace): places = [ core.CUDAPlace(i) - for i in six.xrange(self._get_number_of_places_(num_places)) + for i in six.moves.xrange( + self._get_number_of_places_(num_places)) ] else: places = [ core.CPUPlace() - for _ in six.xrange(self._get_number_of_places_(num_places)) + for _ in six.moves.xrange( + self._get_number_of_places_(num_places)) ] if len(iterable) != len(places): @@ -227,7 +233,7 @@ class DataFeeder(object): "must be same.") place = self.place - for p, batch in six.zip(places, iterable): + for p, batch in six.moves.zip(places, iterable): self.place = p yield self.feed(batch) self.place = place diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py index 1c56064a1e8bdc5d975837cb5a75a40d557765ad..63060a77d1abdfd4060648bfabe25709afcfeb8d 100644 --- a/python/paddle/fluid/debugger.py +++ b/python/paddle/fluid/debugger.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import sys +import six import re -from graphviz import GraphPreviewGenerator -import proto.framework_pb2 as framework_pb2 +from .graphviz import GraphPreviewGenerator +from .proto import framework_pb2 from google.protobuf import text_format _vartype2str_ = [ @@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): graph = GraphPreviewGenerator("some graph") # collect parameters and args protostr = block.desc.serialize_to_string() - desc = framework_pb2.BlockDesc.FromString(str(protostr)) + desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr)) def need_highlight(name): if highlights is None: return False diff --git a/python/paddle/fluid/default_scope_funcs.py b/python/paddle/fluid/default_scope_funcs.py index f8faf6942524612ccc63713240bb289eeeaf75eb..a5b2c84dfe6f2650b4a2ee4465f723812e5d4a01 100644 --- a/python/paddle/fluid/default_scope_funcs.py +++ b/python/paddle/fluid/default_scope_funcs.py @@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be invoked in a new local scope. """ +from __future__ import print_function + import paddle.fluid.core import threading diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 00ba1a0457583d1cc1fa7136ebd51e9ced167832..7a82038ff78b17b2ddfd7b47320d41a7de9a2b8a 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import warnings import numpy as np -import layers -from framework import Program, Variable, program_guard -import unique_name -from layer_helper import LayerHelper -from initializer import Constant +from . import layers +from .framework import Program, Variable, program_guard +from . import unique_name +from .layer_helper import LayerHelper +from .initializer import Constant __all__ = [ 'ChunkEvaluator', diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index b436dfe70afdb52299222f8ba3f5bdff2842d103..288951cd7cd32155f136125fb817c35dd2ec6444 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import contextlib -from framework import Program, default_main_program, Variable +import six +from .framework import Program, default_main_program, Variable from . import core -__all__ = [ - 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var' -] +__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] g_scope = core.Scope() @@ -170,7 +171,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): return fetch_count > 0 -def fetch_var(name, scope=None, return_numpy=True): +def _fetch_var(name, scope=None, return_numpy=True): """ Fetch the value of the variable with the given name from the given scope. @@ -204,23 +205,54 @@ def fetch_var(name, scope=None, return_numpy=True): def _get_program_cache_key(feed, fetch_list): - feed_var_names = feed.keys() + feed_var_names = list(feed.keys()) def to_name_str(var): if isinstance(var, Variable): return var.desc.name() elif isinstance(var, str): return var - elif isinstance(var, basestring): + elif isinstance(var, six.string_types): return str(var) else: raise TypeError(str(var) + " should be Variable or str") - fetch_var_names = map(to_name_str, fetch_list) + fetch_var_names = list(map(to_name_str, fetch_list)) return str(feed_var_names + fetch_var_names) +def _as_lodtensor(data, place): + """ + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + + Examples: + >>> import paddle.fluid as fluid + >>> place = fluid.CPUPlace() + >>> exe = fluid.executor(place) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data) + >>> ... + + Args: + data(numpy.ndarray): a instance of array + + Returns: + LoDTensor + """ + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, place) + return tensor + + class Executor(object): """ An Executor in Python, only support the single-GPU running. For multi-cards, please refer to @@ -229,8 +261,8 @@ class Executor(object): to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides the variables(or names) that user want to get after program run. Note: the executor will run all operators in the program but not only the operators dependent by the fetch_list. 
- It store the global variables into the global scope, and create a local scope for the temporary - variables. The local scope contents will be discarded after every minibatch forward/backward finished. + It store the global variables into the global scope, and create a local scope for the temporary + variables. The local scope contents will be discarded after every minibatch forward/backward finished. But the global scope variables will be persistent through different runs. All of ops in program will be running in sequence. @@ -247,35 +279,7 @@ class Executor(object): p.set_place(place) self.executor = core.Executor(p) self.program_caches = dict() - - def as_lodtensor(self, data): - """ - Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. - For higher dimensional sequence data, please use LoDTensor directly. - - Examples: - >>> import paddle.fluid as fluid - >>> exe = fluid.executor(fluid.CPUPlace()) - >>> data = np.array(size=(100, 200, 300)) - >>> np_outs = map(lambda x: exe.as_lodtensor(x), data) - >>> ... - - Args: - data(numpy.ndarray): a instance of array - - Returns: - LoDTensor - """ - if isinstance(data, list): - raise RuntimeError("Some of your feed data hold LoD information. \ - They can not be completely cast from a list of Python \ - ndarray to LoDTensor. Please convert data to LoDTensor \ - directly before feeding the data.\ - ") - # single tensor case - tensor = core.LoDTensor() - tensor.set(data, self.place) - return tensor + self._closed = False def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -309,7 +313,7 @@ class Executor(object): if not has_feed_operators(global_block, feed, feed_var_name): for i, name in enumerate(feed): out = global_block.var(name) - global_block.prepend_op( + global_block._prepend_op( type='feed', inputs={'X': [feed_var]}, outputs={'Out': [out]}, @@ -318,8 +322,9 @@ class Executor(object): # append fetch_operators if not has_fetch_operators(global_block, fetch_list, fetch_var_name): for i, var in enumerate(fetch_list): - assert isinstance(var, Variable) or isinstance(var, str), ( - "Wrong type for fetch_list[%s]: %s" % (i, type(var))) + assert isinstance(var, Variable) or isinstance( + var, six.string_types), ( + "Wrong type for fetch_list[%s]: %s" % (i, type(var))) global_block.append_op( type='fetch', inputs={'X': [var]}, @@ -335,7 +340,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.as_lodtensor(cur_feed) + cur_feed = _as_lodtensor(cur_feed, self.place) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: @@ -344,15 +349,27 @@ class Executor(object): def _fetch_data(self, fetch_list, fetch_var_name, scope): outs = [ core.get_fetch_variable(scope, fetch_var_name, i) - for i in xrange(len(fetch_list)) + for i in six.moves.range(len(fetch_list)) ] return outs - def begin_pass(self): - self.executor.begin_pass() + def close(self): + """ + Close this executor. + + You can no long use this executor after calling this method. + For the distributed training, this method would free the resource on PServers related to + the current Trainer. - def end_pass(self): - self.executor.end_pass() + Example: + >>> cpu = core.CPUPlace() + >>> exe = Executor(cpu) + >>> ... 
+ >>> exe.close() + """ + if not self._closed: + self.executor.close() + self._closed = True def run(self, program=None, @@ -405,6 +422,10 @@ class Executor(object): >>> feed={'X': x}, >>> fetch_list=[loss.name]) """ + + if self._closed: + raise RuntimeError("Attempted to use a closed Executor") + if feed is None: feed = {} if not isinstance(feed, dict): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ea3117e02bd993b06de39725b2c3296031065e3c..8892606486ee97bb085e642e89fce872e5ba1f7e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -12,28 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import collections import contextlib import re +import six import numpy as np -import proto.framework_pb2 as framework_pb2 +from .. import compat as cpt +from .proto import framework_pb2 try: from . import core -except ImportError, e: +except ImportError as e: raise ImportError( """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" if you encounters \"libmkldnn.so not found\" errors. If you have python installed in other directory, replace \"/usr/local/lib\" with your own - directory. The original error is: \n""" + e.message) -except Exception, e: + directory. The original error is: \n""" + cpt.get_exception_message(e)) +except Exception as e: raise e -import unique_name +from . import unique_name __all__ = [ - 'Block', - 'Variable', 'Program', 'Operator', 'Parameter', @@ -41,12 +43,83 @@ __all__ = [ 'default_main_program', 'program_guard', 'get_var', + 'name_scope', ] EMPTY_VAR_NAME = core.kEmptyVarName() TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() +CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() + + +class NameScope(object): + def __init__(self, name="", parent=None): + self._children = dict() + self._name = name + self._parent = parent + + def child(self, prefix): + if prefix not in self._children: + new_child = NameScope(prefix, self) + self._children[prefix] = [new_child] + else: + new_child = NameScope(prefix + "_%d" % len(self._children[prefix]), + self) + self._children[prefix].append(new_child) + return new_child + + def parent(self): + return self._parent + + def name(self): + return self._name + + +_name_scope = NameScope() + + +@contextlib.contextmanager +def name_scope(prefix=None): + """ + Generate hierarchical name prefix for the operators. + + Note: This should only used for debugging and visualization purpose. + Don't use it for serious analysis such as graph/program transformations. + + Args: + prefix(str): prefix. + + Examples: + .. code-block:: python + with name_scope("encoder"): + ... + with name_scope("decoder"): + ... + with name_scope("attention"): + ... + """ + # TODO(panyx0718): Only [0-9a-z]. + assert prefix, "namescope prefix cannot be empty." 
+ global _name_scope + _name_scope = _name_scope.child(prefix) + yield + _name_scope = _name_scope.parent() + + +def _full_name_scope(): + global _name_scope + scope = _name_scope + name = "" + while scope: + name = scope.name() + "/" + name + scope = scope.parent() + return name + + +def generate_control_dev_var_name(): + import random + return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random()) def grad_var_name(var_name): @@ -87,8 +160,10 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 + elif dtype == np.int8: + return core.VarDesc.VarType.INT8 else: - raise ValueError("Not supported numpy dtype " + str(dtype)) + raise ValueError("Not supported numpy dtype %s" % dtype) def dtype_is_floating(dtype): @@ -131,15 +206,15 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): """ - In Fluid, every input and output of an operator is a variable. In most - cases, variables are used for holding different kinds of data or training - labels. A variable belongs to a block. All variable has its own name and + In Fluid, every input and output of an operator is a variable. In most + cases, variables are used for holding different kinds of data or training + labels. A variable belongs to a block. All variable has its own name and two variables in different blocks could have the same name. - There are many kinds of variables. Each kind of them has its own attributes - and usages. Please reference the framework.proto for details. + There are many kinds of variables. Each kind of them has its own attributes + and usages. Please reference the framework.proto for details. - Most of a Variable's member variables can be setted to be None. It mean + Most of a Variable's member variables can be setted to be None. It mean it is not available or will be specified later. Args: @@ -199,10 +274,11 @@ class Variable(object): if name is None: name = unique_name.generate('_generated_var') is_new_var = False - self.desc = self.block.desc.find_var(name) + name = cpt.to_text(name) + self.desc = self.block.desc.find_var(cpt.to_bytes(name)) if self.desc is None: - self.desc = self.block.desc.var(name) + self.desc = self.block.desc.var(cpt.to_bytes(name)) is_new_var = True if is_new_var: @@ -292,18 +368,18 @@ class Variable(object): assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) protostr = self.desc.serialize_to_string() - proto = framework_pb2.VarDesc.FromString(str(protostr)) + proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr)) res_str = _debug_string_(proto, throw_on_error) if with_details: additional_attr = ("error_clip", "stop_gradient") for attr_name in additional_attr: - res_str += "%s: %s\n" % (attr_name, - str(getattr(self, attr_name))) + res_str += "%s: %s\n" % ( + attr_name, six.binary_type(getattr(self, attr_name))) return res_str __repr__ = __str__ - def set_desc(self, input): + def _set_desc(self, input): """ Set the variable description. @@ -325,7 +401,7 @@ class Variable(object): @property def name(self): - return self.desc.name() + return cpt.to_text(self.desc.name()) @name.setter def name(self, new_name): @@ -348,7 +424,7 @@ class Variable(object): def type(self): return self.desc.type() - def set_error_clip(self, error_clip): + def _set_error_clip(self, error_clip): """ Set the error_clip. 
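# --- Illustrative sketch, not part of the patch ---
# Standalone rendition of the NameScope.child() naming rule introduced above:
# the first child keeps the bare prefix, later children with the same prefix
# get a "_<n>" suffix, so nested fluid.name_scope() blocks yield unique,
# readable operator name prefixes such as "encoder/attention_1/".
class ToyScope(object):
    def __init__(self, name="", parent=None):
        self._children = {}
        self._name = name
        self._parent = parent

    def child(self, prefix):
        siblings = self._children.setdefault(prefix, [])
        name = prefix if not siblings else "%s_%d" % (prefix, len(siblings))
        node = ToyScope(name, self)
        siblings.append(node)
        return node

root = ToyScope()
enc = root.child("encoder")
assert enc.child("attention")._name == "attention"
assert enc.child("attention")._name == "attention_1"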
@@ -371,7 +447,7 @@ def get_all_op_protos(): protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: - op_proto = framework_pb2.OpProto.FromString(str(pbstr)) + op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr)) ret_values.append(op_proto) return ret_values @@ -447,7 +523,7 @@ class Operator(object): Notes: The constructor of operator should not be invoked directly. Use - Block.append_op or Block.prepend_op instead. + Block.append_op or Block._prepend_op instead. Examples: .. code-block:: python @@ -474,26 +550,27 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - self.block = block self.desc = desc - self.attrs = attrs - if self.attrs is None: - self.attrs = dict() + # note: not add self.attrs here: + # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173 + op_attrs = attrs + if op_attrs is None: + op_attrs = dict() del attrs op_maker = core.op_proto_and_checker_maker - if op_maker.kOpRoleAttrName() not in self.attrs: - self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role + if op_maker.kOpRoleAttrName() not in op_attrs: + op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role role_var_name = op_maker.kOpRoleVarAttrName() if len(self.block.program. - op_role_var) != 0 and role_var_name not in self.attrs: - self.attrs[role_var_name] = self.block.program.op_role_var + op_role_var) != 0 and role_var_name not in op_attrs: + op_attrs[role_var_name] = self.block.program.op_role_var - if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0: - del self.attrs[role_var_name] + if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: + del op_attrs[role_var_name] if len(self.desc.type()) != 0: return @@ -503,6 +580,9 @@ class Operator(object): self.desc.set_type(type) proto = OpProtoHolder.instance().get_op_proto(type) + namescope_var_name = op_maker.kOpNameScopeAttrName() + op_attrs[namescope_var_name] = _full_name_scope() + def find_name(var_list, name): for var_name in var_list: if var_list[var_name] is not None and var_name == name: @@ -525,10 +605,12 @@ class Operator(object): % (in_proto.name, len(in_args))) in_arg_names = [] for arg in in_args: - if isinstance(arg, basestring): + if isinstance(arg, six.string_types): in_arg_names.append(arg) + elif isinstance(arg, six.binary_type): + in_arg_names.append(arg.decode()) else: - in_arg_names.append(arg.name) + in_arg_names.append(cpt.to_text(arg.name)) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -543,8 +625,9 @@ class Operator(object): if not given == need: raise ValueError(("Incorrect setting for output(s) of " "operator \"%s\". 
Need: [%s] Given: [%s]") % - (type, ", ".join(str(e) for e in need), - ", ".join(str(e) for e in given))) + (type, + ", ".join(six.binary_type(e) for e in need), + ", ".join(six.binary_type(e) for e in given))) for out_proto in proto.outputs: out_args = outputs[out_proto.name] @@ -556,19 +639,18 @@ class Operator(object): (out_proto.name, len(out_args))) out_arg_names = [] for arg in out_args: - out_arg_names.append(arg.name) + out_arg_names.append(cpt.to_text(arg.name)) arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - if self.attrs is not None: - if not isinstance(self.attrs, dict): + if op_attrs is not None: + if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") for attr in proto.attrs: attr_name = attr.name - if (attr_name not in self.attrs) or ( - self.attrs[attr_name] is None): + if (attr_name not in op_attrs) or (op_attrs[attr_name] is None): continue - attr_val = self.attrs[attr_name] + attr_val = op_attrs[attr_name] self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() @@ -592,7 +674,7 @@ class Operator(object): """ protostr = self.desc.serialize_to_string() - proto = framework_pb2.OpDesc.FromString(str(protostr)) + proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr)) return _debug_string_(proto, throw_on_error) def __str__(self): @@ -716,7 +798,6 @@ class Operator(object): Raises: ValueError: If the type of value doesn't match with desc.attr_type(name). """ - self.attrs[name] = val self._update_desc_attr(name, val) def _update_desc_attr(self, name, val): @@ -758,9 +839,9 @@ class Operator(object): """ return self.desc.attr(name) - def block_attr(self, name): + def block_attr_id(self, name): """ - Get the block attribute by name. + Get the block attribute's id by name. Args: name(str): the attribute name. @@ -768,22 +849,74 @@ class Operator(object): Returns: int: the block index. """ - return self.desc.block_attr(name) + return self.desc.block_attr_id(name) + + def block_attr(self, name): + """ + Get the block attribute by name. + + Args: + name(str): the attribute name. + + Returns: + block: the block attribute. + """ + + id = self.block_attr_id(name) + assert (id >= 0 and id < len(self.block.program.blocks)) + return self.block.program.blocks[id] + + def blocks_attr(self, name): + """ + Get the blocks attribute by name. + + Args: + name(str): the attribute name. + + Returns: + list: list of the blocks attribute. + """ + attrs = [] + for i in self.blocks_attr_ids(name): + assert (i >= 0 and i < len(self.block.program.blocks)) + attrs.append(self.block.program.blocks[i]) + + return attrs + + def blocks_attr_ids(self, name): + """ + Get the blocks attribute's ids by name. + + Args: + name(str): the attribute name. + + Returns: + list: list of the blocks ids. + """ + + return self.desc.blocks_attr_ids(name) def all_attrs(self): """ Get the attribute dict. Returns: - dict: The Operator's attribute dict. + dict: The Operator's attribute dict, name->attr. 
""" attr_names = self.attr_names attr_map = {} for n in attr_names: - if n == 'sub_block': + attr_type = self.desc.attr_type(n) + if attr_type == core.AttrType.BLOCK: attr_map[n] = self.block_attr(n) - else: - attr_map[n] = self.attr(n) + continue + + if attr_type == core.AttrType.BLOCKS: + attr_map[n] = self.blocks_attr(n) + continue + + attr_map[n] = self.attr(n) + return attr_map @@ -847,7 +980,7 @@ class Block(object): re_add_indent = re.compile(r"\n(.)") res_str = "blocks {\n idx: %d\n parent_idx: %d" % ( self.idx, self.parent_idx) - for var in self.vars.itervalues(): + for var in list(self.vars.values()): res_str += "\n vars {\n %s }" % re_add_indent.sub( r"\n \1", var.to_string(throw_on_error, with_details)) for op in self.ops: @@ -856,7 +989,8 @@ class Block(object): res_str += "\n}" else: protostr = self.desc.serialize_to_string() - proto = framework_pb2.BlockDesc.FromString(str(protostr)) + proto = framework_pb2.BlockDesc.FromString( + six.binary_type(protostr)) res_str = _debug_string_(proto, throw_on_error) return res_str @@ -870,7 +1004,7 @@ class Block(object): def forward_block_idx(self): return self.desc.get_forward_block_idx() - def set_forward_block_idx(self, idx): + def _set_forward_block_idx(self, idx): """ Set the forward block Idx. @@ -880,7 +1014,7 @@ class Block(object): Returns: None """ - self.desc.set_forward_block_idx(idx) + self.desc._set_forward_block_idx(idx) @property def idx(self): @@ -900,7 +1034,7 @@ class Block(object): Returns: Variable: the Variable with the giving name. """ - if not isinstance(name, basestring): + if not isinstance(name, six.string_types): raise TypeError( "var require string as parameter, but get %s instead." % (type(name))) @@ -909,7 +1043,7 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v - def var_recursive(self, name): + def _var_recursive(self, name): """ Get a Variable by name from this block recursively. @@ -954,7 +1088,7 @@ class Block(object): return list(self.iter_parameters()) def iter_parameters(self): - return (item[1] for item in self.vars.iteritems() + return (item[1] for item in six.iteritems(self.vars) if isinstance(item[1], Parameter)) def create_var(self, *args, **kwargs): @@ -966,7 +1100,7 @@ class Block(object): def has_var(self, name): return name in self.vars - def rename_var(self, name, new_name): + def _rename_var(self, name, new_name): """ Rename variable in vars and ops' inputs and outputs @@ -982,6 +1116,9 @@ class Block(object): Returns: Variable: the Variable with the giving name. """ + name = cpt.to_text(name) + new_name = cpt.to_text(new_name) + if not self.has_var(name): raise ValueError("var %s is not in current block" % name) v = self.var(name) @@ -1000,9 +1137,9 @@ class Block(object): else: raise ValueError("unsupported var type: %s", type(v)) orig_var_type = v.type - self.desc.rename_var(name, new_name) - # NOTE: v is destroyed by C++ after calling rename_var. - d = self.desc.find_var(new_name) + self.desc._rename_var(cpt.to_bytes(name), cpt.to_bytes(new_name)) + # NOTE: v is destroyed by C++ after calling _rename_var. + d = self.desc.find_var(cpt.to_bytes(new_name)) if var_type == "Parameter": var = Parameter( self, @@ -1024,23 +1161,42 @@ class Block(object): error_clip=error_clip, stop_gradient=stop_gradient) - # rename the python side, sync_with_cpp will only add + # rename the python side, _sync_with_cpp will only add # new vars/ops to python side. 
self.vars[new_name] = var del self.vars[name] - self.sync_with_cpp() + self._sync_with_cpp() return var - def remove_var(self, name): - self.sync_with_cpp() - self.desc.remove_var(name) + def _remove_var(self, name): + self._sync_with_cpp() + self.desc._remove_var(cpt.to_bytes(name)) del self.vars[name] def create_parameter(self, *args, **kwargs): global_block = self.program.global_block() param = Parameter(global_block, *args, **kwargs) if 'initializer' in kwargs: - kwargs['initializer'](param, self) + + def _is_inited_by(block, var): + init_ops = [] + for op in block.ops: + if var.name in op.output_arg_names: + init_ops.append(op) + return init_ops + + initializer = kwargs['initializer'] + init_ops = _is_inited_by(global_block, param) + init_ops_len = len(init_ops) + if init_ops_len > 1: + raise RuntimeError("param " + param.name + + " is inited by multiple init ops " + str( + init_ops)) + elif init_ops_len == 1: + #TODO already inited, do nothing, should log a warning + pass + else: + initializer(param, self) return param def append_op(self, *args, **kwargs): @@ -1055,7 +1211,7 @@ class Block(object): self.ops.append(op) return op - def insert_op(self, index, *args, **kwargs): + def _insert_op(self, index, *args, **kwargs): """ Insert a Operator according to the giving arguments. @@ -1065,13 +1221,13 @@ class Block(object): Returns: Operator: the insert Operator. """ - self.sync_with_cpp() - op_desc = self.desc.insert_op(index) + self._sync_with_cpp() + op_desc = self.desc._insert_op(index) op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.insert(index, op) return op - def remove_op(self, index): + def _remove_op(self, index): """ Remove the specific position operator. @@ -1081,11 +1237,11 @@ class Block(object): Returns: None """ - self.sync_with_cpp() - self.desc.remove_op(index, index + 1) + self._sync_with_cpp() + self.desc._remove_op(index, index + 1) del self.ops[index] - def slice_ops(self, start, end): + def _slice_ops(self, start, end): """ Return the Operator between start and end. @@ -1098,13 +1254,13 @@ class Block(object): """ return self.ops[start:end] - def prepend_op(self, *args, **kwargs): - op_desc = self.desc.prepend_op() + def _prepend_op(self, *args, **kwargs): + op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) self.ops.insert(0, op) return op - def sync_with_cpp(self): + def _sync_with_cpp(self): """ Sync from the desc on the c++ end. This method is used to synchronize the c++ desc instance generated by backward. @@ -1115,8 +1271,8 @@ class Block(object): self.create_var(name=var.name(), desc=var, type=var.type()) # sync variables removed from c++ end - for var in self.vars.keys(): - if not self.desc.find_var(var): + for var in list(self.vars.keys()): + if not self.desc.find_var(cpt.to_bytes(var)): self.vars.pop(var) # sync operators from cpp @@ -1170,7 +1326,7 @@ class Block(object): for index in range(len(self.ops)): assert self.ops[index].desc == ops_in_cpp[index] - def copy_param_info_from(self, other): + def _copy_param_info_from(self, other): """ Copy the information of parameters from the other block. 
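# --- Illustrative sketch, not part of the patch ---
# The create_parameter() change above now scans the global block for ops that
# already write the new parameter and skips running the initializer a second
# time. Reduced to plain Python; ToyOp is a hypothetical stand-in for an
# operator exposing output_arg_names.
class ToyOp(object):
    def __init__(self, output_arg_names):
        self.output_arg_names = output_arg_names

def init_ops_for(ops, param_name):
    # Every op that outputs param_name counts as an initializer of it.
    return [op for op in ops if param_name in op.output_arg_names]

ops = [ToyOp(["fc_0.w_0"]), ToyOp(["fc_0.b_0"])]
assert len(init_ops_for(ops, "fc_0.w_0")) == 1   # already initialized once
assert init_ops_for(ops, "fc_1.w_0") == []       # not yet initialized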
@@ -1185,12 +1341,13 @@ class Block(object): None """ if not isinstance(other, Block): - raise TypeError("copy_param_info_from should be invoked with Block") + raise TypeError( + "_copy_param_info_from should be invoked with Block") for p in other.iter_parameters(): assert isinstance(p, Parameter) v = self.vars.get(p.name, None) if v is None: - raise ValueError("copy_param_info_from should be invoked with " + raise ValueError("_copy_param_info_from should be invoked with " "same topology") assert isinstance(v, Variable) new_p = Parameter( @@ -1208,7 +1365,7 @@ class Block(object): name=v.name) self.vars[new_p.name] = new_p - def clone_variable(self, var): + def _clone_variable(self, var): """ Clone a variable into current block. @@ -1282,6 +1439,13 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] + # for distribute + self._is_distributed = False + self._is_chief = False + self._slice_vars_and_attrs = [] + self._endpoints = [] + self._distributed_lookup_table = None + @property def op_role(self): """ @@ -1319,7 +1483,7 @@ class Program(object): self._op_role_var = [var_name] @contextlib.contextmanager - def optimized_guard(self, var): + def optimized_guard(self, param_and_grads): """ A with guard to set :code:`Optimization` :code:`OpRole` and :code:`OpRoleVar` automatically. @@ -1327,17 +1491,20 @@ class Program(object): Notes: This is a very low level API. Users should not use it directly. Args: - var(Variable|str): The variable (name) to be optimized. + param_and_grads(list): The variables (names) to be optimized. Examples: >>> p, g = backward(...) - >>> with program.optimized_guard(p): + >>> with program.optimized_guard([p,g]): >>> p = p - 0.001 * g """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize - self._op_role_var = [var.name if isinstance(var, Variable) else var] + self._op_role_var = [ + var.name if isinstance(var, Variable) else var + for var in param_and_grads + ] yield self._op_role_var = [] self._current_role = OpRole.Forward @@ -1382,7 +1549,8 @@ class Program(object): res_str += block.to_string(throw_on_error, with_details) else: protostr = self.desc.serialize_to_string() - proto = framework_pb2.ProgramDesc.FromString(str(protostr)) + proto = framework_pb2.ProgramDesc.FromString( + six.binary_type(protostr)) res_str = _debug_string_(proto, throw_on_error) return res_str @@ -1396,6 +1564,9 @@ class Program(object): """ return self.desc + def _version(self): + return self.desc._version() + def clone(self, for_test=False): """ Create a new, duplicated program. @@ -1476,14 +1647,22 @@ class Program(object): The two code snippets above will generate same programs. 
""" if for_test: - p = self.inference_optimize() + p = self.inference_optimize(export_for_deployment=False) else: p = Program() + p.current_block_idx = self.current_block_idx + p._seed = self._seed p.desc = core.ProgramDesc(self.desc) - p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] - p.sync_with_cpp() + p.blocks = [ + Block(p, i) for i in six.moves.range(self.desc.num_blocks()) + ] + + p._current_role = self._current_role + p._op_role_var = self._op_role_var - p.copy_param_info_from(self) + p._sync_with_cpp() + + p._copy_param_info_from(self) p.copy_data_info_from(self) return p @@ -1532,16 +1711,27 @@ class Program(object): targets_idx.append([t.block.idx, t.idx]) res = Program() res.desc = core.prune(self.desc, targets_idx) - res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] - res.sync_with_cpp() + res.blocks = [ + Block(res, i) for i in six.moves.range(res.desc.num_blocks()) + ] + res._sync_with_cpp() return res - def inference_optimize(self): + def inference_optimize(self, export_for_deployment=True): """ - This method will create a new program and change the :code:`is_test` + This method will create a new program and do following adjustments on it: + 1. Remove all reader variables and their creator ops if exist. + + 2. Remove the :code:`read_op` if exists. + + 3. change the :code:`is_test` attribute of operators to :code:`True`. All the :code:`Parameter` information will be lost. + Args: + export_for_deployment(bool): remove the read ops that are added by py_reader + for cpp inference library + Notes: This API is a very low level API. Use :code:`Program.clone(for_test=True)` instead. @@ -1552,14 +1742,33 @@ class Program(object): # core.inference_optimize being fixed. res = Program() res.desc = core.ProgramDesc(self.desc) - for i in xrange(res.desc.num_blocks()): + + # remove all readers and the read_op if exist + read_op_idx = 0 + root_block = res.desc.block(0) + if export_for_deployment: + while True: + if read_op_idx >= root_block.op_size() or root_block.op( + read_op_idx).type() == 'read': + break + read_op_idx += 1 + if read_op_idx < root_block.op_size(): + root_block._remove_op(0, read_op_idx + 1) + for var in root_block.all_vars(): + if var.type() == core.VarDesc.VarType.READER: + root_block._remove_var(cpt.to_bytes(var.name())) + + # change all `is_test` attributes to True + for i in six.moves.range(res.desc.num_blocks()): block = res.desc.block(i) - for j in xrange(block.op_size()): + for j in six.moves.range(block.op_size()): op = block.op(j) if op.has_attr('is_test'): op.set_attr('is_test', True) - res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] - res.sync_with_cpp() + res.blocks = [ + Block(res, i) for i in six.moves.range(res.desc.num_blocks()) + ] + res._sync_with_cpp() return res @staticmethod @@ -1571,15 +1780,15 @@ class Program(object): and deserialization. Args: - binary_str(str): The binary prootbuf string. + binary_str_type(str): The binary prootbuf string. Returns: Program: A deserialized program desc. 
""" p = Program() p.desc = core.ProgramDesc(binary_str) - p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] - p.sync_with_cpp() + p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())] + p._sync_with_cpp() return p @property @@ -1606,7 +1815,7 @@ class Program(object): self._seed = seed def __repr__(self): - return str(self) + return self.__str__() def global_block(self): """ @@ -1659,7 +1868,7 @@ class Program(object): """ self.current_block_idx = self.current_block().parent_idx - def sync_with_cpp(self): + def _sync_with_cpp(self): """ Synchronize Python instance to its binding C++ object instance. If the program is modified in C++ space, this method should be invoked. @@ -1673,9 +1882,9 @@ class Program(object): for block_idx in range(len(self.blocks), self.desc.num_blocks()): self.blocks.append(Block(self, block_idx)) for block in self.blocks: - block.sync_with_cpp() + block._sync_with_cpp() - def copy_param_info_from(self, other): + def _copy_param_info_from(self, other): """ Copy the information of parameters from other program. @@ -1689,13 +1898,13 @@ class Program(object): None """ if not isinstance(other, Program): - raise TypeError("copy_param_info_from should be invoked with " + raise TypeError("_copy_param_info_from should be invoked with " "Program") if len(self.blocks) != len(other.blocks): - raise ValueError("copy_param_info_from should be invoked with two " + raise ValueError("_copy_param_info_from should be invoked with two " "program, with represent the same topology") - self.global_block().copy_param_info_from(other.global_block()) + self.global_block()._copy_param_info_from(other.global_block()) def copy_data_info_from(self, other): """ @@ -1711,13 +1920,13 @@ class Program(object): None """ if not isinstance(other, Program): - raise TypeError("copy_param_info_from should be invoked with " + raise TypeError("_copy_param_info_from should be invoked with " "Program") if len(self.blocks) != len(other.blocks): - raise ValueError("copy_param_info_from should be invoked with two " + raise ValueError("_copy_param_info_from should be invoked with two " "program, with represent the same topology") - for var in other.global_block().vars.itervalues(): + for var in list(other.global_block().vars.values()): if var.is_data: self.global_block().var(var.name).is_data = True @@ -1729,15 +1938,15 @@ class Program(object): iterable: The generator will yield every variable in this program. """ for each_block in self.blocks: - for each_var in each_block.vars.itervalues(): + for each_var in list(each_block.vars.values()): yield each_var class Parameter(Variable): """ - Parameter is derived from Variable. A parameter is a persistable + Parameter is derived from Variable. A parameter is a persistable Variable, and will be updated by optimizers after each iteration. - The training of a neural network is essentially the updating of + The training of a neural network is essentially the updating of its parameters. 
Relative to a general Variable, a Parameter has several its own @@ -1803,8 +2012,8 @@ class Parameter(Variable): additional_attr = ("trainable", "optimize_attr", "regularizer", "gradient_clip_attr", "do_model_average") for attr_name in additional_attr: - res_str += "%s: %s\n" % (attr_name, - str(getattr(self, attr_name))) + res_str += "%s: %s\n" % ( + attr_name, six.binary_type(getattr(self, attr_name))) else: res_str = Variable.to_string(self, throw_on_error, False) return res_str diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py index 125b4efa9d476e561bd78d0365cd92bbf7e66605..2b18d854d18bcbebce2a0eb30b8690db49d9d246 100644 --- a/python/paddle/fluid/graphviz.py +++ b/python/paddle/fluid/graphviz.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import os import random +import six +import functools import subprocess import logging def crepr(v): - if type(v) is str or type(v) is unicode: + if isinstance(v, six.string_types): return '"%s"' % v return str(v) @@ -104,8 +108,9 @@ class Graph(object): def _rank_repr(self): ranks = sorted( - self.rank_groups.items(), - cmp=lambda a, b: a[1].priority > b[1].priority) + six.iteritems(self.rank_groups), + key=functools.cmp_to_key( + lambda a, b: a[1].priority > b[1].priority)) repr = [] for x in ranks: repr.append(str(x[1])) @@ -148,7 +153,7 @@ class Node(object): name=self.name, label=self.label, extra=',' + ','.join("%s=%s" % (key, crepr(value)) - for key, value in self.attrs.items()) + for key, value in six.iteritems(self.attrs)) if self.attrs else "") return reprs @@ -172,7 +177,7 @@ class Edge(object): target=self.target.name, extra="" if not self.attrs else "[" + ','.join("{}={}".format(attr[0], crepr(attr[1])) - for attr in self.attrs.items()) + "]") + for attr in six.iteritems(self.attrs)) + "]") return repr diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index a81e39695b78f235d6ae896d90117dd392692634..a9b94a20720615dbfca97749463f27dbc88ac64f 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import contextlib -import core +from . import core -import executor -import framework -import io -import parallel_executor -import unique_name -from trainer import check_and_get_place +from . import executor +from . import framework +from . import io +from . import parallel_executor +from . import unique_name +from .trainer import check_and_get_place __all__ = ['Inferencer', ] @@ -96,10 +98,9 @@ class Inferencer(object): raise ValueError( "inputs should be a map of {'input_name': input_var}") - with executor.scope_guard(self.scope): - results = self.exe.run(self.inference_program, - feed=inputs, - fetch_list=[self.predict_var], + with self._prog_and_scope_guard(): + results = self.exe.run(feed=inputs, + fetch_list=[self.predict_var.name], return_numpy=return_numpy) return results diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 373e9c060de1ee27c165ccd2380cd8c38612c4d9..bd46ed8e50c9344d471578eb0f89b7e214d62722 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
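# --- Illustrative sketch, not part of the patch ---
# graphviz.py above switches from sorted(..., cmp=...) to functools.cmp_to_key()
# because the cmp= argument was removed in Python 3. The toy comparator below
# is a hypothetical example of the same migration.
import functools

groups = [('slow', 3), ('fast', 1), ('medium', 2)]

# Python 2 only:  sorted(groups, cmp=lambda a, b: a[1] - b[1])
ordered = sorted(groups, key=functools.cmp_to_key(lambda a, b: a[1] - b[1]))
assert [name for name, _ in ordered] == ['fast', 'medium', 'slow']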
-import framework +from __future__ import print_function + +from . import framework import numpy as np import contextlib -from framework import convert_np_dtype_to_dtype_ -from core import VarDesc +from .core import VarDesc __all__ = [ 'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA', @@ -148,7 +149,7 @@ class ConstantInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended - op = block.prepend_op( + op = block._prepend_op( type="fill_constant", outputs={"Out": var}, attrs={ @@ -202,7 +203,7 @@ class UniformInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed - op = block.prepend_op( + op = block._prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ @@ -256,7 +257,7 @@ class NormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed - op = block.prepend_op( + op = block._prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ @@ -264,7 +265,8 @@ class NormalInitializer(Initializer): "dtype": int(var.dtype), "mean": self._mean, "std": self._std_dev, - "seed": self._seed + "seed": self._seed, + "use_mkldnn": False }) var.op = op return op @@ -346,7 +348,7 @@ class XavierInitializer(Initializer): if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) - op = block.prepend_op( + op = block._prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ @@ -359,7 +361,7 @@ class XavierInitializer(Initializer): else: std = np.sqrt(2.0 / float(fan_in + fan_out)) - op = block.prepend_op( + op = block._prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ @@ -444,7 +446,7 @@ class MSRAInitializer(Initializer): if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) - op = block.prepend_op( + op = block._prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ @@ -457,7 +459,7 @@ class MSRAInitializer(Initializer): else: std = np.sqrt(2.0 / float(fan_in)) - op = block.prepend_op( + op = block._prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5c8f4f6507c7dd9b3d005639d962ce1e55b2c704..656fafa0cb54d70e0eba8ec2bef21488c50d8d94 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import os import errno import time import shutil +import six from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable @@ -24,10 +27,7 @@ from . 
import core __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', - 'get_inference_program', 'save_checkpoint', 'load_checkpoint', - 'clean_checkpoint', 'load_persist_vars_without_grad', - 'load_lookup_table_vars', 'save_persist_vars_without_grad', - 'get_latest_checkpoint_serial' + 'get_inference_program' ] @@ -69,7 +69,8 @@ def is_persistable(var): res = fluid.io.is_persistable(param) """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST: + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: return False return var.persistable @@ -94,34 +95,34 @@ def save_vars(executor, """ Save variables to the given directory by executor. - There are two ways to specify variables to be saved: The first way, list - variables in a list and assign it to the `vars`. The second way, assign the - `main_program` with an existing program, then all variables in the program - will be saved. The first way has a higher priority. In other words, if `vars` + There are two ways to specify variables to be saved: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be saved. The first way has a higher priority. In other words, if `vars` are assigned, the `main_program` and the `predicate` will be ignored. - The `dirname` are used to specify the folder where to save variables. - If you prefer to save variables in separate files in the folder `dirname`, - set `filename` None; if you prefer to save all variables in a single file, + The `dirname` are used to specify the folder where to save variables. + If you prefer to save variables in separate files in the folder `dirname`, + set `filename` None; if you prefer to save all variables in a single file, use `filename` to specify it. Args: executor(Executor): The executor to run for saving variables. dirname(str): The directory path. - main_program(Program|None): The program whose variables will be saved. - If it is None, the default main program will + main_program(Program|None): The program whose variables will be saved. + If it is None, the default main program will be used automatically. Default: None - vars(list[Variable]|None): The list that contains all variables to save. + vars(list[Variable]|None): The list that contains all variables to save. It has a higher priority than the `main_program`. Default: None - predicate(function|None): If it is not None, only variables in the - `main_program` that makes predicate(variable)==True - will be saved. It only works when we are using the - `main_program` to specify variables (In other words + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be saved. It only works when we are using the + `main_program` to specify variables (In other words `vars` is None). Default: None - filename(str|None): The file which to save all variables. If you prefer to save + filename(str|None): The file which to save all variables. If you prefer to save variables separately, set it to None. 
Default: None @@ -151,7 +152,7 @@ def save_vars(executor, # The second usage: using `vars` to specify variables var_list = [var_a, var_b, var_c] - fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, + fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, filename="vars_file") # var_a, var_b and var_c will be saved. And they are going to be # saved in the same file named 'var_file' in the path "./my_paddle_model". @@ -165,7 +166,7 @@ def save_vars(executor, save_vars( executor, dirname=dirname, - vars=filter(predicate, main_program.list_vars()), + vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: save_program = Program() @@ -205,14 +206,14 @@ def save_params(executor, dirname, main_program=None, filename=None): This function filters out all parameters from the give `main_program` and then save them to the folder `dirname` or the file `filename`. - Use the `dirname` to specify the saving folder. If you would like to - save parameters in separate files, set `filename` None; if you would - like to save all parameters in a single file, use `filename` to specify + Use the `dirname` to specify the saving folder. If you would like to + save parameters in separate files, set `filename` None; if you would + like to save all parameters in a single file, use `filename` to specify the file name. - NOTICE: Some variables are not Parameter while they are necessary for - training. So you can NOT save and continue your training just by - `save_params()` and `load_params()`. Please use `save_persistables()` + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` and `load_persistables()` instead. Args: @@ -222,8 +223,8 @@ def save_params(executor, dirname, main_program=None, filename=None): saved. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file to save all parameters. If you prefer - to save parameters in differnet files, set it + filename(str|None): The file to save all parameters. If you prefer + to save parameters in differnet files, set it to None. Default: None @@ -236,7 +237,7 @@ def save_params(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.save_params(executor=exe, dirname=param_path, + fluid.io.save_params(executor=exe, dirname=param_path, main_program=None) """ save_vars( @@ -250,23 +251,23 @@ def save_params(executor, dirname, main_program=None, filename=None): def save_persistables(executor, dirname, main_program=None, filename=None): """ - This function filters out all variables with `persistable==True` from the - give `main_program` and then saves these variables to the folder `dirname` + This function filters out all variables with `persistable==True` from the + give `main_program` and then saves these variables to the folder `dirname` or file `filename`. - The `dirname` is used to specify the folder where persistable variables - are going to be saved. If you would like to save variables in separate - files, set `filename` None; if you would like to save all variables in a + The `dirname` is used to specify the folder where persistable variables + are going to be saved. 
If you would like to save variables in separate + files, set `filename` None; if you would like to save all variables in a single file, use `filename` to specify the file name. Args: executor(Executor): The executor to run for saving persistable variables. dirname(str): The directory path. - main_program(Program|None): The program whose persistbale variables will - be saved. If it is None, the default main + main_program(Program|None): The program whose persistbale variables will + be saved. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file to saved all variables. If you prefer to + filename(str|None): The file to saved all variables. If you prefer to save variables in differnet files, set it to None. Default: None @@ -279,7 +280,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.save_persistables(executor=exe, dirname=param_path, + fluid.io.save_persistables(executor=exe, dirname=param_path, main_program=None) """ save_vars( @@ -300,34 +301,34 @@ def load_vars(executor, """ Load variables from the given directory by executor. - There are two ways to specify variables to be loaded: The first way, list - variables in a list and assign it to the `vars`. The second way, assign the - `main_program` with an existing program, then all variables in the program - will be loaded. The first way has a higher priority. In other words if `vars` + There are two ways to specify variables to be loaded: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be loaded. The first way has a higher priority. In other words if `vars` are assigned, the `main_program` and the `predicate` will be ignored. - The `dirname` are used to specify the folder where to load variables. - If variables were saved in separate files in the folder `dirname`, - set `filename` None; if all variables were saved in a single file, + The `dirname` are used to specify the folder where to load variables. + If variables were saved in separate files in the folder `dirname`, + set `filename` None; if all variables were saved in a single file, use `filename` to specify it. Args: executor(Executor): The executor to run for loading variables. dirname(str): The directory path. - main_program(Program|None): The program whose variables will be loaded. - If it is None, the default main program will + main_program(Program|None): The program whose variables will be loaded. + If it is None, the default main program will be used automatically. Default: None - vars(list[Variable]|None): The list that contains all variables to load. + vars(list[Variable]|None): The list that contains all variables to load. It has a higher priority than the `main_program`. Default: None - predicate(function|None): If it is not None, only variables in the - `main_program` that makes predicate(variable)==True - will be loaded. It only works when we are using the - `main_program` to specify variables (In other words + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be loaded. It only works when we are using the + `main_program` to specify variables (In other words `vars` is None). Default: None - filename(str|None): The file which saved all required variables. 
If variables + filename(str|None): The file which saved all required variables. If variables were saved in differnet files, set it to None. Default: None @@ -357,9 +358,9 @@ def load_vars(executor, # The second usage: using `vars` to specify variables var_list = [var_a, var_b, var_c] - fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, + fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, filename="vars_file") - # var_a, var_b and var_c will be loaded. And they are supposed to haven + # var_a, var_b and var_c will be loaded. And they are supposed to haven # been saved in the same file named 'var_file' in the path "./my_paddle_model". """ if vars is None: @@ -371,7 +372,8 @@ def load_vars(executor, load_vars( executor, dirname=dirname, - vars=filter(predicate, main_program.list_vars()), + main_program=main_program, + vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: load_prog = Program() @@ -402,9 +404,15 @@ def load_vars(executor, inputs={}, outputs={"Out": load_var_list}, attrs={'file_path': os.path.join(dirname, filename)}) - executor.run(load_prog) + if main_program is None: + main_program = default_main_program() + + # load slice vars on pserver, if have it. + _load_slice_up_vars(executor, dirname, + main_program._slice_vars_and_attrs) + def load_params(executor, dirname, main_program=None, filename=None): """ @@ -412,15 +420,15 @@ def load_params(executor, dirname, main_program=None, filename=None): and then trys to load these parameters from the folder `dirname` or the file `filename`. - Use the `dirname` to specify the folder where parameters were saved. If - parameters were saved in separate files in the folder `dirname`, set - `filename` None; if all parameters were saved in a single file, use + Use the `dirname` to specify the folder where parameters were saved. If + parameters were saved in separate files in the folder `dirname`, set + `filename` None; if all parameters were saved in a single file, use `filename` to specify the file name. - NOTICE: Some variables are not Parameter while they are necessary for - training. So you can NOT save and continue your training just by - `save_params()` and `load_params()`. Please use `save_persistables()` - and `load_persistables()` instead. + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` + and `load_persistables()` instead. Args: executor(Executor): The executor to run for loading parameters. @@ -429,7 +437,7 @@ def load_params(executor, dirname, main_program=None, filename=None): loaded. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file which saved all parameters. If parameters + filename(str|None): The file which saved all parameters. If parameters were saved in differnet files, set it to None. 
Default: None @@ -442,7 +450,7 @@ def load_params(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.load_params(executor=exe, dirname=param_path, + fluid.io.load_params(executor=exe, dirname=param_path, main_program=None) """ load_vars( @@ -455,23 +463,23 @@ def load_params(executor, dirname, main_program=None, filename=None): def load_persistables(executor, dirname, main_program=None, filename=None): """ - This function filters out all variables with `persistable==True` from the - give `main_program` and then trys to load these variables from the folder + This function filters out all variables with `persistable==True` from the + give `main_program` and then trys to load these variables from the folder `dirname` or the file `filename`. - Use the `dirname` to specify the folder where persistable variables were - saved. If variables were saved in separate files, set `filename` None; - if all variables were saved in a single file, use `filename` to specify + Use the `dirname` to specify the folder where persistable variables were + saved. If variables were saved in separate files, set `filename` None; + if all variables were saved in a single file, use `filename` to specify the file name. Args: executor(Executor): The executor to run for loading persistable variables. dirname(str): The directory path. - main_program(Program|None): The program whose persistbale variables will - be loaded. If it is None, the default main + main_program(Program|None): The program whose persistbale variables will + be loaded. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file which saved all variables. If variables were + filename(str|None): The file which saved all variables. If variables were saved in differnet files, set it to None. Default: None @@ -484,7 +492,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.load_persistables(executor=exe, dirname=param_path, + fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=None) """ load_vars( @@ -526,7 +534,7 @@ def prepend_feed_ops(inference_program, for i, name in enumerate(feed_target_names): out = global_block.var(name) - global_block.prepend_op( + global_block._prepend_op( type='feed', inputs={'X': [feed_var]}, outputs={'Out': [out]}, @@ -556,28 +564,31 @@ def save_inference_model(dirname, executor, main_program=None, model_filename=None, - params_filename=None): + params_filename=None, + export_for_deployment=True): """ Prune the given `main_program` to build a new program especially for inference, and then save it and all related parameters to given `dirname` by the `executor`. Args: dirname(str): The directory path to save the inference model. - feeded_var_names(list[str]): Names of variables that need to be feeded data + feeded_var_names(list[str]): Names of variables that need to be feeded data during inference. - target_vars(list[Variable]): Variables from which we can get inference + target_vars(list[Variable]): Variables from which we can get inference results. executor(Executor): The executor that saves the inference model. - main_program(Program|None): The original program, which will be pruned to - build the inference model. 
If is setted None, + main_program(Program|None): The original program, which will be pruned to + build the inference model. If is setted None, the default main program will be used. Default: None. - model_filename(str|None): The name of file to save the inference program - itself. If is setted None, a default filename + model_filename(str|None): The name of file to save the inference program + itself. If is setted None, a default filename `__model__` will be used. - params_filename(str|None): The name of file to save all related parameters. - If it is setted None, parameters will be saved + params_filename(str|None): The name of file to save all related parameters. + If it is setted None, parameters will be saved in separate files . + export_for_deployment(bool): remove the read ops that are added by py_reader + for cpp inference lib. Default True Returns: None @@ -594,19 +605,21 @@ def save_inference_model(dirname, fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], target_vars=[predict_var], executor=exe) - # In this exsample, the function will prune the default main program - # to make it suitable for infering the `predict_var`. The pruned - # inference program is going to be saved in the "./infer_model/__model__" + # In this exsample, the function will prune the default main program + # to make it suitable for infering the `predict_var`. The pruned + # inference program is going to be saved in the "./infer_model/__model__" # and parameters are going to be saved in separate files under folder - # "./infer_model". + # "./infer_model". """ - if isinstance(feeded_var_names, basestring): + if isinstance(feeded_var_names, six.string_types): feeded_var_names = [feeded_var_names] else: if len(feeded_var_names) > 0: + # TODO(paddle-dev): polish these code blocks if not (bool(feeded_var_names) and all( - isinstance(name, basestring) for name in feeded_var_names)): + isinstance(name, six.string_types) + for name in feeded_var_names)): raise ValueError("'feed_var_names' should be a list of str.") if isinstance(target_vars, Variable): @@ -628,11 +641,12 @@ def save_inference_model(dirname, for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": - global_block.remove_op(i) + global_block._remove_op(i) copy_program.desc.flush() pruned_program = copy_program.prune(targets=target_vars) - inference_program = pruned_program.inference_optimize() + inference_program = pruned_program.inference_optimize( + export_for_deployment=export_for_deployment) fetch_var_names = [v.name for v in target_vars] prepend_feed_ops(inference_program, feeded_var_names) @@ -652,11 +666,19 @@ def save_inference_model(dirname, save_persistables(executor, dirname, inference_program, params_filename) + # if there is lookup table, the trainer 0 will notify all pserver to save. + if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + _save_lookup_tables_by_notify(executor, lookup_table_filename, + main_program._distributed_lookup_table, + main_program._endpoints) + def load_inference_model(dirname, executor, model_filename=None, - params_filename=None): + params_filename=None, + pserver_endpoints=None): """ Load inference model from a directory @@ -664,22 +686,26 @@ def load_inference_model(dirname, dirname(str): The directory path executor(Executor): The executor to run for loading inference model. 
model_filename(str|None): The name of file to load inference program. - If it is None, the default filename + If it is None, the default filename '__model__' will be used. Default: None params_filename(str|None): The name of file to load all parameters. - It is only used for the case that all - parameters were saved in a single binary - file. If parameters were saved in separate + It is only used for the case that all + parameters were saved in a single binary + file. If parameters were saved in separate files, set it as 'None'. + pserver_endpoints(list|None): This only need by distributed inference. + When use distributed look up table in training, + We also need it in inference.The parameter is + a list of pserver endpoints. Returns: tuple: The return of this function is a tuple with three elements: - (program, feed_target_names, fetch_targets). The `program` is a - Program, it's the program for inference. The `feed_target_names` is - a list of str, it contains Names of variables that need to feed - data in the inference program. The `fetch_targets` is a list of - Variable. It contains variables from which we can get inference + (program, feed_target_names, fetch_targets). The `program` is a + Program, it's the program for inference. The `feed_target_names` is + a list of str, it contains Names of variables that need to feed + data in the inference program. The `fetch_targets` is a list of + Variable. It contains variables from which we can get inference results. Raises: @@ -690,17 +716,21 @@ def load_inference_model(dirname, exe = fluid.Executor(fluid.CPUPlace()) path = "./infer_model" - [inference_program, feed_target_names, fetch_targets] = + endpoints = ["127.0.0.1:2023","127.0.0.1:2024"] + [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe) results = exe.run(inference_program, feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) - # In this exsample, the inference program was saved in the - # "./infer_model/__model__" and parameters were saved in - # separate files in ""./infer_model". - # After getting inference program, feed target names and - # fetch targets, we can use an Executor to run the inference + # if we need lookup table, we will use: + fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints) + + # In this exsample, the inference program was saved in the + # "./infer_model/__model__" and parameters were saved in + # separate files in ""./infer_model". + # After getting inference program, feed target names and + # fetch targets, we can use an Executor to run the inference # program to get the inference result. """ @@ -720,8 +750,15 @@ def load_inference_model(dirname, program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) + if not core._is_program_version_supported(program._version()): + raise ValueError("Unsupported program version: %d\n" % + program._version()) + # Binary data also need versioning. 
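# --- Illustrative sketch, not part of the patch ---
# What the _endpoints_replacement() helper further below does, reduced to plain
# dictionaries: walk every op and rewrite its 'epmap' attribute so a program
# saved against training pservers points at the inference pservers instead.
# `toy_ops` is hypothetical; the real code iterates program.global_block().ops.
def replace_endpoints(ops, endpoints, attr_name="epmap"):
    for op in ops:
        if attr_name in op:
            op[attr_name] = list(endpoints)

toy_ops = [{"type": "lookup_table", "epmap": ["127.0.0.1:6000"]},
           {"type": "mul"}]
replace_endpoints(toy_ops, ["10.0.0.1:6000", "10.0.0.2:6000"])
assert toy_ops[0]["epmap"] == ["10.0.0.1:6000", "10.0.0.2:6000"]
assert "epmap" not in toy_ops[1]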
load_persistables(executor, dirname, program, params_filename) + if pserver_endpoints: + program = _endpoints_replacement(program, pserver_endpoints) + feed_target_names = program.desc.get_feed_target_names() fetch_target_names = program.desc.get_fetch_target_names() fetch_targets = [ @@ -731,6 +768,61 @@ def load_inference_model(dirname, return [program, feed_target_names, fetch_targets] +def _save_lookup_tables_by_notify(executor, dirname, lookup_table, + pserver_endpoints): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + _save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + pserver_endpoints=ps_endpoints) + """ + + pserver_notify_program = Program() + pserver_notify_block = pserver_notify_program.global_block() + + attrs = {} + attrs['epmap'] = pserver_endpoints + attrs['dir'] = dirname + attrs['lookup_table'] = lookup_table + + pserver_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(pserver_notify_program) + + +def _endpoints_replacement(program, endpoints): + ENDPOINT_MAP = "epmap" + for op in program.global_block().ops: + if op.has_attr(ENDPOINT_MAP): + op.set_attr(ENDPOINT_MAP, endpoints) + program._sync_with_cpp() + return program + + def get_parameter_value(para, executor): """ Get the LoDTensor value of the given parameter. @@ -794,681 +886,44 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) -SUCCESS_MARK_FILENAME = "_SUCCESS" -CHECKPOINT_PREFIX = "checkpoint" -MODEL_DIR = "__model__" -LOOKUP_TABLE_DIR = "__lookup_table__" -TRAINER_PREFIX = "trainer" -CHECKPOINT_SEPARATOR = "_" - - -def save_checkpoint(executor, - checkpoint_dir, - trainer_id, - trainer_args=None, - main_program=None, - max_num_checkpoints=3, - lookup_table=None, - ps_endpoint_list=None): - """ - This function filters out all checkpoint variables from the give - main_program and then saves these variables to the `checkpoint_dir` - directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there might be a lot of checkpoints in the - `checkpoint_dir`. To avoid them taking too much disk space, the - `max_num_checkpoints` are introduced to limit the total number of - checkpoints. If the number of existing checkpints is greater than - the `max_num_checkpoints`, oldest ones will be scroll deleted. - - A variable is a checkpoint variable and will be saved if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for save checkpoint. 
- checkpoint_dir(str): The folder where to save checkpoints. - trainer_id(int): currect trainer id, if id is equal to 0, the trainer - is chief. - trainer_args(dict|None): Current training arguments. Such as 'epoch_id' - and 'step_id'. - Defaut: None - main_program(Program|None): The program whose checkpoint variables will - be saved. If it is None, the default main program will be used. - max_num_checkpoints(int): The max number of total number of existing - checkpoints. - Default: 3 - lookup_table(string|None): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list|None): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - AssertionError: If `trainer_args` is not a dict. - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - trainer_args = {"epoch_id": 200, - "step_id": 20} # just an example - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - fluid.io.save_checkpoint(executor=exe, - checkpoint_dir=path, - trainer_id=0, - trainer_args=trainer_args, - main_program=prog, - max_num_checkpoints=3, - lookup_table=table_name, - ps_endpoint_list = ps_endpoints) - """ - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - assert checkpoint_dir - - if trainer_args: - assert isinstance(trainer_args, dict) - - is_chief = trainer_id == 0 - - _make_chekcpoint_dirs(checkpoint_dir) - serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 - cur_dir = _get_serial_dir(checkpoint_dir, serial) - - save_trainer_args(cur_dir, trainer_id, trainer_args) - - if is_chief: - save_persist_vars_without_grad(executor, cur_dir, main_program) - - if is_chief and lookup_table and ps_endpoint_list: - save_pserver_vars_by_notify(executor, cur_dir, lookup_table, - ps_endpoint_list) - - _scroll_delete(checkpoint_dir, max_num_checkpoints) - - -def load_checkpoint(executor, checkpoint_dir, serial, main_program): - """ - This function filters out all checkpoint variables from the give - main_program and then try to load these variables from the - `checkpoint_dir` directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there are more than one checkpoint in the - `checkpoint_dir` (each checkpoint has its own sub folder), use - `serial` to specify which serial of checkpoint you would like to - load. - - A variable is a checkpoint variable and will be loaded if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading checkpoint. - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. - main_program(Program): The program whose checkpoint variables will - be loaded. - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - ValueError: If `serial` is None or `serial` is less than 0. - ValueError: If `main_program` is None. - - Examples: - .. 
code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, - serial=9, main_program=prog) - - # In this example, `load_checkpoint` function - # will first filters out all checkpoint variables in the default - # main program, and then try to load these variables form the - # folder "./checkpoints/checkpoint_9/__model__". - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - - if serial is None or serial < 0: - raise ValueError("'serial' should not be None or <0 ") - - if main_program is None: - raise ValueError('main_program should not be None.') - - cur_dir = _get_serial_dir(checkpoint_dir, serial) - load_persist_vars_without_grad(executor, cur_dir, main_program, True) - - -def clean_checkpoint(checkpoint_dir, delete_dir=False): - """ - clean the checkpoint dir, when the train exits normally, - the trainer will call clean_checkpoint to delete checkpoint directory saved before. - delete_dir only works when the directory is empty, otherwise, OSError is raised. - - : param checkpoint_dir - : param delete_dir - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - _scroll_delete(checkpoint_dir, max_num_checkpoints=0) - - if delete_dir and not os.listdir(checkpoint_dir): - os.rmdir(checkpoint_dir) - - -def load_persist_vars_without_grad(executor, - dirname, - program, - has_model_dir=False): - """ - This function filters out all checkpoint variables from the give - program and then trys to load these variables from the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be loaded. - has_model_dir(bool): if True, the function loads variables - from a sub directory named '__model__'. - Default: False - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - fluid.io.load_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog, has_model_dir=True) - - # In this example, `load_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then trys to load these variables form the - # folder "./my_paddle_model/__model__". - """ - - if has_model_dir: - dirname = _get_model_dir(dirname) - - load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var, - filename=None) - - -def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): - """ - The parameter server will load lookup table's local file in - selectedrows variable. - - Args: - executor(Executor): The executor to run for loading persistable variables - dirname(str): The directory path - main_program(Program): Find the variable named table_name in main_program - pserver_id(int): the serial number in pserver_endpoints list - table_name(str): lookup table name - - Returns: - None - - Examples: - .. 
code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - dirname = "./checkpoints/checkpoint_9/__model__" - prog = fluid.default_main_program() - pserver_id = 1 - table_name = "share_w" - fluid.io.load_lookup_table_vars(executor=exe, - dirname=dirname, program=prog, pserver_id=pserver_id, - table_name=table_name) - """ - - for var in program.list_vars(): - if var.name == table_name: - lookup_table_var = var - break - - assert lookup_table_var is not None - - lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) +def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): + if not slice_vars_and_attrs: + return load_prog = Program() load_block = load_prog.global_block() - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [lookup_table_var]}, - attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + for var_tuple in slice_vars_and_attrs: + orig_var = var_tuple[0] + start = var_tuple[1] + slice_var = var_tuple[2] + end = start + reduce(lambda x, y: x * y, slice_var.shape) + + clone_orig_var = load_block.create_var( + name=orig_var.name, + type=orig_var.type, + shape=orig_var.shape, + dtype=orig_var.dtype, + persistable=True) + + clone_slice_var = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [clone_orig_var]}, + attrs={'file_path': os.path.join(dirname, clone_orig_var.name)}) + load_block.append_op( + type="slice", + inputs={'Input': clone_orig_var}, + outputs={'Out': clone_slice_var}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) executor.run(load_prog) - - -def save_persist_vars_without_grad(executor, dirname, program): - """ - This function filters out all checkpoint variables from the give - program and then save these variables to a sub-folder '__model__' of - the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for saving variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be saved. - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - fluid.io.save_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog) - - # In this example, `save_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then saves these variables to the folder - # "./my_paddle_model/__model__". - """ - cur_dir = _get_model_dir(dirname) - save_vars( - executor, - dirname=cur_dir, - main_program=program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - _write_success(cur_dir) - - -def save_pserver_vars_by_notify(executor, dirname, lookup_table, - ps_endpoint_list): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save checkpoints. 
- lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - fluid.io.save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - ps_endpoint_list=ps_endpoints) - """ - cur_dir = _get_lookuptable_dir(dirname) - - checkpoint_notify_program = Program() - checkpoint_notify_block = checkpoint_notify_program.global_block() - - attrs = {} - attrs['epmap'] = ps_endpoint_list - attrs['dir'] = cur_dir - attrs['lookup_table'] = lookup_table - - checkpoint_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(checkpoint_notify_program) - - -def save_trainer_args(dirname, trainer_id, trainer_args): - assert isinstance(trainer_args, dict) - - cur_dir = _get_trainer_dir(dirname, trainer_id) - - for name, value in trainer_args.iteritems(): - args_file = os.path.join(cur_dir, name) - with open(args_file, 'w') as f: - f.write(str(value)) - _write_success(cur_dir) - - -def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): - """ - trainer will load some args from it's independent directory, - such as epoch_id and step_id. - - Args: - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. - trainer_id(int): current trainer id. - trainer_args(list): list about load trainer args - Return: - None - - Examples: - .. code-block:: python - - param_path = "./checkpoint/" - serial = 7 - trainer_id = 2 - trainer_args = ["epoch_id", "step_id"] - - fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial, - trainer_id=trainer_id, trainer_args=trainer_args) - """ - assert isinstance(trainer_args, list) - - cur_dir = _get_serial_dir(checkpoint_dir, serial) - cur_dir = _get_trainer_dir(cur_dir, trainer_id) - - ret_values = [] - - for arg in trainer_args: - cur_file = os.path.join(cur_dir, arg) - with open(cur_file, 'r') as f: - contents = f.read() - ret_values.append(contents.strip()) - return ret_values - - -def _is_checkpoint_var(var): - """ - the checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - - : param var(Variable) - """ - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW: - return False - # @GRAD are named for gradient variables, checkpoint will not save it. - if "@GRAD" in var.name: - return False - # .trainer_ are named for distribute train variables, checkpoint will not save it. - if ".trainer_" in var.name: - return False - - # .block is named for distribute train variables, checkpoint will not save it. - if ".block" in var.name: - return False - - return var.persistable - - -def _make_chekcpoint_dirs(dirs): - """ - _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
- """ - assert dirs is not None - - if os.path.isfile(dirs): - raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) - - if not os.path.isdir(dirs): - try: - os.makedirs(dirs) - except OSError as err: - if err.errno != errno.EEXIST: - raise err - - -def _get_dir_serial(dirname): - _, serial = dirname.split(CHECKPOINT_SEPARATOR) - - try: - serial_num = int(serial) - except ValueError: - serial_num = -1 - return serial_num - - -def _get_serial_dir(dirname, serial): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - serial_dir = os.path.join(dirname, serial_folder) - _make_chekcpoint_dirs(serial_dir) - - return serial_dir - - -def _get_model_dir(dirname): - model_dir = os.path.join(dirname, MODEL_DIR) - _make_chekcpoint_dirs(model_dir) - return model_dir - - -def _get_lookuptable_dir(dirname): - lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - _make_chekcpoint_dirs(lookuptable_dir) - return lookuptable_dir - - -def _get_trainer_dir(dirname, trainer_id): - trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) - trainer_dir = os.path.join(dirname, trainer_folder) - _make_chekcpoint_dirs(trainer_dir) - return trainer_dir - - -def _scroll_delete(dirname, max_num_checkpoints=3): - dirs = os.listdir(dirname) - serial_map = {} - for serial in dirs: - serial_num = _get_dir_serial(serial) - serial_map[serial_num] = serial - - if len(serial_map.keys()) <= max_num_checkpoints: - return - - serials = serial_map.keys() - serials.sort(reverse=True) - serials = serials[max_num_checkpoints:] - for serial in serials: - cur_dir = _get_serial_dir(dirname, serial) - try: - shutil.rmtree(cur_dir) - except OSError as err: - if err.errno != errno.ENOENT: - raise err - - -def _write_success(dirname): - """ - write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. - - : param dirname - """ - success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) - with open(success_file, 'a') as f: - now = time.ctime() - f.write(now) - - -def get_latest_checkpoint_serial(checkpoint_dir): - """ - get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory - - : param checkpoint_dir - """ - if not checkpoint_dir: - return -1 - - def has_success(checkpoint_dir, cur_dir): - """ - is _SUCCESS in this dir - """ - - serial = _get_dir_serial(cur_dir) - if serial == -1 or not os.path.isdir( - os.path.join(checkpoint_dir, cur_dir)): - return -1 - - success_path = os.path.join( - _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, - SUCCESS_MARK_FILENAME) - if os.path.isfile(success_path): - return serial - - if not os.path.isdir(checkpoint_dir): - return -1 - - current_dir = -1 - dirs = os.listdir(checkpoint_dir) - for cur_dir in dirs: - success_num = has_success(checkpoint_dir, cur_dir) - if success_num > current_dir: - current_dir = success_num - return current_dir - - -def get_test_program(filelist, program=None, startup_program=None): - """ - Transpile current train program to a program to read test dataset - if the program is using reader ops like "open_files_op". 
- """ - - def _copy_reader_var_(block, var, new_name=None): - if new_name == None: - new_name = var.name - new_var = block.create_var( - name=str(new_name), type=core.VarDesc.VarType.READER) - new_var.desc.set_shapes(var.desc.shapes()) - new_var.desc.set_dtypes(var.desc.dtypes()) - new_var.persistable = True - return new_var - - def _get_test_reader_name(train_reader_name): - return train_reader_name + "_test" - - def _is_reader_op(op): - block = op.block - if "Out" in op.output_names: - reader_out = block.vars[op.output("Out")[0]] - if reader_out.type == core.VarDesc.VarType.READER: - return True - return False - - if program == None: - program = default_main_program() - if startup_program == None: - startup_program = default_startup_program() - startup_block = startup_program.global_block() - - # 1. find out the orignal reader var name - startup_reader_op_list = [] - - for op in startup_block.ops: - if _is_reader_op(op): - startup_reader_op_list.append(op) - - if len(startup_reader_op_list) == 0: - return program - - root_reader_op = startup_reader_op_list[0] - train_test_reader_map = {} - # 2. add operators to startup to read open and read test data files - for op in startup_reader_op_list: - assert (len(op.output("Out")) == 1) - train_reader_name = op.output("Out")[0] - train_reader = startup_block.vars[train_reader_name] - test_reader = _copy_reader_var_( - startup_block, - train_reader, - new_name=_get_test_reader_name(train_reader_name)) - train_test_reader_map[train_reader.name] = test_reader - - test_op_inputs = {} - for name in op.input_names: - train_arg_names = op.input(name) - test_arg_vars = [] - for arg_name in train_arg_names: - arg_var = train_test_reader_map[ - arg_name] if name == "UnderlyingReader" else startup_block.vars[ - arg_name] - test_arg_vars.append(arg_var) - test_op_inputs[name] = test_arg_vars - - test_op = startup_block.append_op( - type=op.type, - inputs=test_op_inputs, - outputs={'Out': [test_reader]}, - attrs=op.attrs) - # root reader op's filelist attr for read test files - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - # 3. rename reader vars in inference program to different name - # to avoid read from train data. - main_block = program.global_block() - for var in main_block.vars.values(): - if var.type == core.VarDesc.VarType.READER: - main_block.rename_var( - str(var.name), str(_get_test_reader_name(var.name))) - - for op in main_block.ops: - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - startup_program.sync_with_cpp() - program.sync_with_cpp() - - return program diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 86efd1ff51cf29485ee28b4d60ffb1439af1aad9..bd9727b6ac0208b199091db00bd0fd5fae74d53b 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import copy import itertools +import six -from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating -import unique_name +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from . 
import unique_name from paddle.fluid.initializer import Constant, Xavier -from param_attr import ParamAttr, WeightNormParamAttr -import core +from .param_attr import ParamAttr, WeightNormParamAttr +from . import core +from six.moves import zip class LayerHelper(object): @@ -68,11 +72,11 @@ class LayerHelper(object): @property def param_attr(self): - return ParamAttr.to_attr(self.kwargs.get('param_attr', None)) + return ParamAttr._to_attr(self.kwargs.get('param_attr', None)) @property def bias_attr(self): - return ParamAttr.to_attr(self.kwargs.get('bias_attr', None)) + return ParamAttr._to_attr(self.kwargs.get('bias_attr', None)) def multiple_param_attr(self, length): param_attr = self.param_attr @@ -83,7 +87,7 @@ class LayerHelper(object): raise ValueError("parameter number mismatch") elif len(param_attr) == 1 and length != 1: tmp = [None] * length - for i in xrange(length): + for i in six.moves.range(length): tmp[i] = copy.deepcopy(param_attr[0]) param_attr = tmp return param_attr @@ -91,7 +95,7 @@ class LayerHelper(object): def iter_inputs_and_params(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) param_attrs = self.multiple_param_attr(len(inputs)) - for ipt, param_attr in itertools.izip(inputs, param_attrs): + for ipt, param_attr in zip(inputs, param_attrs): yield ipt, param_attr def input_dtype(self, input_param_name='input'): @@ -218,7 +222,7 @@ class LayerHelper(object): norm = __norm_op(reshape, dim=0, block=block) __reshape_op(norm, out=out, shape=out_shape, block=block) else: - perm = range(len(x.shape)) + perm = list(range(len(x.shape))) perm[0], perm[dim] = dim, 0 transpose = __transpose_op(x, perm, block=block) norm = __norm_op(transpose, dim=0, block=block) @@ -262,11 +266,11 @@ class LayerHelper(object): g_param = self.startup_program.global_block().create_parameter( dtype=dtype, shape=g_param_shape, - **g_param_attr.to_kwargs(with_initializer=False)) + **g_param_attr._to_kwargs(with_initializer=False)) v_param = self.startup_program.global_block().create_parameter( dtype=dtype, shape=v_param_shape, - **v_param_attr.to_kwargs(with_initializer=True)) + **v_param_attr._to_kwargs(with_initializer=True)) __norm_except_dim( x=v_param, out=g_param, @@ -275,9 +279,9 @@ class LayerHelper(object): # Add weight normalization to main_program g_param = self.main_program.global_block().create_parameter( - dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs()) + dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs()) v_param = self.main_program.global_block().create_parameter( - dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs()) + dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs()) w_param = __weight_normalize(g_param, v_param, dim=attr.dim) return w_param @@ -296,11 +300,11 @@ class LayerHelper(object): if default_initializer is None and attr.initializer is None: if is_bias: - attr.set_default_bias_initializer() + attr._set_default_bias_initializer() else: - attr.set_default_param_initializer() + attr._set_default_param_initializer() else: - attr.set_default_initializer(default_initializer) + attr._set_default_initializer(default_initializer) # If weight normalization is set, insert extra parameters and ops. 
# Refer to https://arxiv.org/pdf/1602.07868.pdf @@ -310,9 +314,9 @@ class LayerHelper(object): return param self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True)) + dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr.to_kwargs()) + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) @@ -397,8 +401,10 @@ class LayerHelper(object): act = self.kwargs.get('act', None) if act is None: return input_var - if isinstance(act, basestring): + if isinstance(act, six.string_types): act = {'type': act} + else: + raise TypeError(str(act) + " should be unicode or str") if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): act['use_cudnn'] = self.kwargs.get('use_cudnn') diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250..a2a808777ddc499570eb9ef92175787a14cf77ca 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -12,28 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ops -from ops import * -import nn -from nn import * -import io -from io import * -import tensor -from tensor import * -import control_flow -from control_flow import * -import device -from device import * -import math_op_patch -from math_op_patch import * -import detection -from detection import * -import metric_op -from metric_op import * -from learning_rate_scheduler import * +from __future__ import print_function + +from . import ops +from .ops import * +from . import nn +from .nn import * +from . import io +from .io import * +from . import tensor +from .tensor import * +from . import control_flow +from .control_flow import * +from . import device +from .device import * +from . import math_op_patch +from .math_op_patch import * +from . import detection +from .detection import * +from . import metric_op +from .metric_op import * +from .learning_rate_scheduler import * __all__ = [] -__all__ += math_op_patch.__all__ __all__ += nn.__all__ __all__ += io.__all__ __all__ += tensor.__all__ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 849474dc58461ac3772f439da7bf5d57592daa8c..c9a2f8a0abf9c811074e3fbadec0c61cb6dbf681 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -11,40 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import contextlib -from layer_function_generator import autodoc, templatedoc -from tensor import assign, fill_constant +from .layer_function_generator import autodoc, templatedoc +from .tensor import assign, fill_constant from .. 
import core from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu -from ops import logical_and, logical_not, logical_or +from .ops import logical_and, logical_not, logical_or import numpy +import warnings +import six +from functools import reduce __all__ = [ - 'split_lod_tensor', - 'merge_lod_tensor', - 'BlockGuard', - 'BlockGuardWithCompletion', - 'WhileGuard', 'While', 'Switch', - 'lod_rank_table', - 'max_sequence_len', - 'lod_tensor_to_array', - 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', 'less_than', 'equal', 'array_read', - 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', - 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank', 'ParallelDo', @@ -195,7 +189,6 @@ def Print(input, message="The content of some_layer: ") ''' helper = LayerHelper('print', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) helper.append_op( type='print', inputs={'In': input}, @@ -208,9 +201,7 @@ def Print(input, 'print_tensor_shape': print_tensor_shape, 'print_tensor_lod': print_tensor_lod, 'print_phase': print_phase.upper() - }, - outputs={'Out': out}) - return out + }) class BlockGuard(object): @@ -286,11 +277,14 @@ class ParallelDo(object): avg_cost = fluid.layers.mean(x=cost) .. warning:: - + It will be soon deprecated, please use ParallelExecutor instead. """ def __init__(self, places, use_nccl=False, name=None): + warnings.warn( + "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.", + Warning) self.helper = LayerHelper("parallel_do", name=name) self.inputs = [] self.places = places @@ -349,7 +343,7 @@ class ParallelDo(object): return [parent_block.var(name) for name in params] - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() @@ -405,7 +399,7 @@ class BlockGuardWithCompletion(BlockGuard): if exc_type is not None: return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_op() + self.rnn._complete_op() return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, exc_tb) @@ -481,7 +475,7 @@ class StaticRNN(object): if shape is None or batch_ref is None: raise ValueError( "if init is None, memory at least need shape and batch_ref") - parent_block = self.parent_block() + parent_block = self._parent_block() var_name = unique_name.generate("@".join( [self.helper.name, "memory_boot"])) boot_var = parent_block.create_var( @@ -538,7 +532,7 @@ class StaticRNN(object): outputs={'Out': tmp_o}, attrs={'dtype': o.dtype}) - out_var = self.parent_block().create_var( + out_var = self._parent_block().create_var( name=tmp_o.name, shape=[self.seq_len] + list(tmp_o.shape), dtype=tmp_o.dtype) @@ -554,7 +548,7 @@ class StaticRNN(object): raise TypeError("update memory should take variables") self.memories[mem.name].mem = var - def parent_block(self): + def _parent_block(self): prog = self.helper.main_program parent_idx = prog.current_block().parent_idx assert parent_idx >= 0 @@ -571,10 +565,10 @@ class StaticRNN(object): else: return self.outputs - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program rnn_block = main_program.current_block() - parent_block = self.parent_block() + parent_block = self._parent_block() local_inputs = set() @@ -608,7 +602,7 @@ class StaticRNN(object): boot_memories = [] pre_memories = [] memories = [] - for _, mem in 
self.memories.iteritems(): + for _, mem in six.iteritems(self.memories): boot_memories.append(mem.init) pre_memories.append(mem.pre_mem.name) mem_var = rnn_block.var(mem.mem.name) @@ -654,7 +648,7 @@ class WhileGuard(BlockGuard): if exc_type is not None: return False self.while_op.status = While.AFTER_WHILE_BLOCK - self.while_op.complete() + self.while_op._complete() return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) @@ -664,6 +658,7 @@ class While(object): Args: cond (Variable): condition used to compare. + is_test(bool): A flag indicating whether execution is in test phase. name (str): The name of this layer. Examples: @@ -686,7 +681,7 @@ class While(object): IN_WHILE_BLOCK = 1 AFTER_WHILE_BLOCK = 2 - def __init__(self, cond, name=None): + def __init__(self, cond, is_test=False, name=None): self.helper = LayerHelper("while", name=name) self.status = While.BEFORE_WHILE_BLOCK if not isinstance(cond, Variable): @@ -697,11 +692,12 @@ class While(object): if reduce(lambda a, b: a * b, cond.shape, 1) != 1: raise TypeError("condition should be a bool scalar") self.cond_var = cond + self.is_test = is_test def block(self): return WhileGuard(self) - def complete(self): + def _complete(self): main_program = self.helper.main_program while_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() @@ -730,13 +726,16 @@ class While(object): parent_block.append_op( type='while', inputs={ - 'X': - [parent_block.var_recursive(x_name) for x_name in x_name_list], + 'X': [ + parent_block._var_recursive(x_name) + for x_name in x_name_list + ], 'Condition': [self.cond_var] }, outputs={'Out': out_vars, 'StepScopes': [step_scope]}, - attrs={'sub_block': while_block}) + attrs={'sub_block': while_block, + "is_test": self.is_test}) def lod_rank_table(x, level=0): @@ -824,21 +823,21 @@ def max_sequence_len(rank_table): def lod_tensor_to_array(x, table): - """ + """ Convert a LoDTensor to a LoDTensorArray. - This function split a LoDTesnor to a LoDTensorArray according to its LoD - information. LoDTensorArray is an alias of C++ std::vector in - PaddlePaddle. The generated LoDTensorArray of this function can be further read - or written by `read_from_array()` and `write_to_array()` operators. However, - this function is generally an internal component of PaddlePaddle `DynamicRNN`. + This function split a LoDTesnor to a LoDTensorArray according to its LoD + information. LoDTensorArray is an alias of C++ std::vector in + PaddlePaddle. The generated LoDTensorArray of this function can be further read + or written by `read_from_array()` and `write_to_array()` operators. However, + this function is generally an internal component of PaddlePaddle `DynamicRNN`. Users should not use it directly. Args: x (Variable|list): The LoDTensor to be converted to a LoDTensorArray. table (ParamAttr|list): The variable that stores the level of lod which is ordered by sequence length in - descending order. It is generally generated + descending order. It is generally generated by `layers.lod_rank_table()` API. Returns: @@ -1072,9 +1071,9 @@ def array_read(array, i): Given: array = [0.6, 0.1, 0.3, 0.1] - + And: - + i = 2 Then: @@ -1181,9 +1180,9 @@ def array_length(array): class ConditionalBlockGuard(BlockGuard): """ - ConditionalBlockGuard is derived from BlockGuard. It is dedicated for - holding a ConditionalBlock, and helping users entering and exiting the - ConditionalBlock via Python's 'with' keyword. 
However, ConditionalBlockGuard + ConditionalBlockGuard is derived from BlockGuard. It is dedicated for + holding a ConditionalBlock, and helping users entering and exiting the + ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard is generally an internal component of IfElse, users should not use it directly. """ @@ -1259,7 +1258,7 @@ class ConditionalBlock(object): input_set = set([ipt.name for ipt in self.inputs]) param_list = [ - parent_block.var_recursive(each_name) for each_name in params + parent_block._var_recursive(each_name) for each_name in params if each_name not in input_set ] @@ -1273,8 +1272,8 @@ class ConditionalBlock(object): parent_block.append_op( type='conditional_block', inputs={ - 'X': self.inputs, - 'Params': param_list, + 'Cond': self.inputs, + 'Input': param_list, }, outputs={'Out': out_list, 'Scope': [step_scope]}, @@ -1458,7 +1457,7 @@ class IfElse(object): if self.status == IfElse.OUT_IF_ELSE_BLOCKS: raise ValueError("input must in true/false blocks") if id(x) not in self.input_table: - parent_block = self.parent_block() + parent_block = self._parent_block() out_true = parent_block.create_var( name=unique_name.generate('ifelse_input' + self.helper.name), dtype=x.dtype) @@ -1484,7 +1483,7 @@ class IfElse(object): else: return out_false - def parent_block(self): + def _parent_block(self): current_block = self.helper.main_program.current_block() return self.helper.main_program.block(current_block.parent_idx) @@ -1500,7 +1499,7 @@ class IfElse(object): out_table = self.output_table[1 if self.status == self.IN_IF_ELSE_TRUE_BLOCKS else 0] - parent_block = self.parent_block() + parent_block = self._parent_block() for each_out in outs: if not isinstance(each_out, Variable): raise TypeError("Each output should be a variable") @@ -1517,7 +1516,7 @@ class IfElse(object): def __call__(self): if self.status != self.OUT_IF_ELSE_BLOCKS: raise ValueError("IfElse::__call__ must be out of sub-block") - false_len, true_len = map(len, self.output_table) + false_len, true_len = list(map(len, self.output_table)) if false_len == 0 and true_len == 0: raise ValueError("Must invoke true_block/false_block before " "__call__") @@ -1937,7 +1936,7 @@ def is_empty(x, cond=None, **ignored): Args: x (Variable): The Variable to be tested. - cond (Variable|None): Output parameter. Returns the test result + cond (Variable|None): Output parameter. Returns the test result of given 'x'. Default: None Returns: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 6af01297df54ffd4201776d20d51a88f5808ccb0..1c73c837e2aa422b67704e171f66f5cd48e171ce 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -15,12 +15,19 @@ All layers just related to the detection neural network. """ -from layer_function_generator import generate_layer_fn -from layer_function_generator import autodoc, templatedoc +from __future__ import print_function + +from .layer_function_generator import generate_layer_fn +from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper -import tensor -import nn +from . import tensor +from . import nn +from . import ops +from ... 
import compat as cpt import math +import six +import numpy +from functools import reduce __all__ = [ 'prior_box', @@ -30,12 +37,16 @@ __all__ = [ 'detection_output', 'ssd_loss', 'detection_map', + 'rpn_target_assign', 'anchor_generator', + 'generate_proposal_labels', + 'generate_proposals', ] __auto__ = [ 'iou_similarity', 'box_coder', + 'polygon_box_transform', ] __all__ += __auto__ @@ -44,6 +55,145 @@ for _OP in set(__auto__): globals()[_OP] = generate_layer_fn(_OP) +def rpn_target_assign(bbox_pred, + cls_logits, + anchor_box, + anchor_var, + gt_boxes, + is_crowd, + im_info, + rpn_batch_size_per_im=256, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, + rpn_positive_overlap=0.7, + rpn_negative_overlap=0.3, + use_random=True): + """ + ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** + + This layer can be, for given the Intersection-over-Union (IoU) overlap + between anchors and ground truth boxes, to assign classification and + regression targets to each each anchor, these target labels are used for + train RPN. The classification targets is a binary class label (of being + an object or not). Following the paper of Faster-RCNN, the positive labels + are two kinds of anchors: (i) the anchor/anchors with the highest IoU + overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap + higher than rpn_positive_overlap(0.7) with any ground-truth box. Note + that a single ground-truth box may assign positive labels to multiple + anchors. A non-positive anchor is when its IoU ratio is lower than + rpn_negative_overlap (0.3) for all ground-truth boxes. Anchors that are + neither positive nor negative do not contribute to the training objective. + The regression targets are the encoded ground-truth boxes associated with + the positive anchors. + + Args: + bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the + predicted locations of M bounding bboxes. N is the batch size, + and each bounding box has four coordinate values and the layout + is [xmin, ymin, xmax, ymax]. + cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the + predicted confidence predictions. N is the batch size, 1 is the + frontground and background sigmoid, M is number of bounding boxes. + anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, + each box is represented as [xmin, ymin, xmax, ymax], + [xmin, ymin] is the left top coordinate of the anchor box, + if the input is image feature map, they are close to the origin + of the coordinate system. [xmax, ymax] is the right bottom + coordinate of the anchor box. + anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded + variances of anchors. + gt_boxes (Variable): The ground-truth boudding boxes (bboxes) are a 2D + LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth + bboxes of mini-batch input. + is_crowd (Variable): A 1-D LoDTensor which indicates groud-truth is crowd. + im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, + 3 is the height, width and scale. + rpn_batch_size_per_im(int): Total number of RPN examples per image. + rpn_straddle_thresh(float): Remove RPN anchors that go outside the image + by straddle_thresh pixels. + rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled + foreground (i.e. class > 0), 0-th class is background. + rpn_positive_overlap(float): Minimum overlap required between an anchor + and ground-truth box for the (anchor, gt box) pair to be a positive + example. 
+ rpn_negative_overlap(float): Maximum overlap allowed between an anchor + and ground-truth box for the (anchor, gt box) pair to be a negative + examples. + + Returns: + tuple: + A tuple(predicted_scores, predicted_location, target_label, + target_bbox) is returned. The predicted_scores and + predicted_location is the predicted result of the RPN. + The target_label and target_bbox is the ground truth, + respectively. The predicted_location is a 2D Tensor with shape + [F, 4], and the shape of target_bbox is same as the shape of + the predicted_location, F is the number of the foreground + anchors. The predicted_scores is a 2D Tensor with shape + [F + B, 1], and the shape of target_label is same as the shape + of the predicted_scores, B is the number of the background + anchors, the F and B is depends on the input of this operator. + + Examples: + .. code-block:: python + + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], + append_batch_size=False, dtype='float32') + cls_logits = layers.data(name='cls_logits', shape=[100, 1], + append_batch_size=False, dtype='float32') + anchor_box = layers.data(name='anchor_box', shape=[20, 4], + append_batch_size=False, dtype='float32') + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], + append_batch_size=False, dtype='float32') + loc_pred, score_pred, loc_target, score_target = + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, + anchor_box=anchor_box, + gt_boxes=gt_boxes) + """ + + helper = LayerHelper('rpn_target_assign', **locals()) + # Assign target label to anchors + loc_index = helper.create_tmp_variable(dtype='int32') + score_index = helper.create_tmp_variable(dtype='int32') + target_label = helper.create_tmp_variable(dtype='int32') + target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) + helper.append_op( + type="rpn_target_assign", + inputs={ + 'Anchor': anchor_box, + 'GtBoxes': gt_boxes, + 'IsCrowd': is_crowd, + 'ImInfo': im_info + }, + outputs={ + 'LocationIndex': loc_index, + 'ScoreIndex': score_index, + 'TargetLabel': target_label, + 'TargetBBox': target_bbox + }, + attrs={ + 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, + 'rpn_positive_overlap': rpn_positive_overlap, + 'rpn_negative_overlap': rpn_negative_overlap, + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random + }) + + loc_index.stop_gradient = True + score_index.stop_gradient = True + target_label.stop_gradient = True + target_bbox.stop_gradient = True + + cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) + bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + predicted_cls_logits = nn.gather(cls_logits, score_index) + predicted_bbox_pred = nn.gather(bbox_pred, loc_index) + + return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox + + def detection_output(loc, scores, prior_box, @@ -98,8 +248,8 @@ def detection_output(loc, nms_eta(float): The parameter for adaptive NMS. Returns: - Variable: - + Variable: + The detection outputs is a LoDTensor with shape [No, 6]. Each row has six values: [label, confidence, xmin, ymin, xmax, ymax]. `No` is the total number of detections in this mini-batch. 
For each @@ -132,10 +282,11 @@ def detection_output(loc, prior_box_var=prior_box_var, target_box=loc, code_type='decode_center_size') - old_shape = scores.shape - scores = nn.reshape(x=scores, shape=(-1, old_shape[-1])) + compile_shape = scores.shape + run_shape = ops.shape(scores) + scores = nn.flatten(x=scores, axis=2) scores = nn.softmax(input=scores) - scores = nn.reshape(x=scores, shape=old_shape) + scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) @@ -369,7 +520,7 @@ def target_assign(input, Assumed that the row offset for each instance in `neg_indices` is called neg_lod, for i-th instance and each `id` of neg_indices in this instance: - + .. code-block:: text out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} @@ -387,12 +538,11 @@ def target_assign(input, mismatch_value (float32): Fill this value to the mismatched location. Returns: - tuple: - - A tuple(out, out_weight) is returned. out is a 3D Tensor with - shape [N, P, K], N and P is the same as they are in - `neg_indices`, K is the same as it in input of X. If - `match_indices[i][j]`. out_weight is the weight for output with + tuple: + A tuple(out, out_weight) is returned. out is a 3D Tensor with + shape [N, P, K], N and P is the same as they are in + `neg_indices`, K is the same as it in input of X. If + `match_indices[i][j]`. out_weight is the weight for output with the shape of [N, P, 1]. Examples: @@ -546,9 +696,10 @@ def ssd_loss(location, raise ValueError("Only support mining_type == max_negative now.") num, num_prior, num_class = confidence.shape + conf_shape = ops.shape(confidence) def __reshape_to_2d(var): - return nn.reshape(x=var, shape=[-1, var.shape[-1]]) + return nn.flatten(x=var, axis=2) # 1. Find matched boundding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. @@ -559,7 +710,8 @@ def ssd_loss(location, # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices - gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, )) + gt_label = nn.reshape( + x=gt_label, shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1)) gt_label.stop_gradient = True target_label, _ = target_assign( gt_label, matched_indices, mismatch_value=background_label) @@ -570,9 +722,12 @@ def ssd_loss(location, target_label = __reshape_to_2d(target_label) target_label.stop_gradient = True conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) - # 3. Mining hard examples - conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior)) + conf_loss = nn.reshape( + x=conf_loss, + shape=(num, num_prior), + actual_shape=ops.slice( + conf_shape, axes=[0], starts=[0], ends=[2])) conf_loss.stop_gradient = True neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype @@ -591,7 +746,7 @@ def ssd_loss(location, }, attrs={ 'neg_pos_ratio': neg_pos_ratio, - 'neg_dist_threshold': neg_pos_ratio, + 'neg_dist_threshold': neg_overlap, 'mining_type': mining_type, 'sample_size': sample_size, }) @@ -641,7 +796,11 @@ def ssd_loss(location, # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss # reshape to [N, Np], N is the batch size and Np is the prior box number. 
- loss = nn.reshape(x=loss, shape=[-1, num_prior]) + loss = nn.reshape( + x=loss, + shape=(num, num_prior), + actual_shape=ops.slice( + conf_shape, axes=[0], starts=[0], ends=[2])) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) @@ -660,7 +819,8 @@ def prior_box(input, clip=False, steps=[0.0, 0.0], offset=0.5, - name=None): + name=None, + min_max_aspect_ratios_order=False): """ **Prior Box Operator** @@ -689,6 +849,11 @@ def prior_box(input, Default: [0., 0.] offset(float): Prior boxes center offset. Default: 0.5 name(str): Name of the prior box op. Default: None. + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the final + detection results. Default: False. Returns: tuple: A tuple with two Variable (boxes, variances) @@ -742,7 +907,8 @@ def prior_box(input, 'clip': clip, 'step_w': steps[0], 'step_h': steps[1], - 'offset': offset + 'offset': offset, + 'min_max_aspect_ratios_order': min_max_aspect_ratios_order } if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: if not _is_list_or_tuple_(max_sizes): @@ -782,7 +948,8 @@ def multi_box_head(inputs, kernel_size=1, pad=0, stride=1, - name=None): + name=None, + min_max_aspect_ratios_order=False): """ Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. The details of this algorithm, please refer the @@ -825,6 +992,11 @@ def multi_box_head(inputs, pad(int|list|tuple): The padding of conv2d. Default:0. stride(int|list|tuple): The stride of conv2d. Default:1, name(str): Name of the prior box layer. Default: None. + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the fininal + detection results. Default: False. Returns: tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances) @@ -861,13 +1033,7 @@ def multi_box_head(inputs, """ def _reshape_with_axis_(input, axis=1): - if not (axis > 0 and axis < len(input.shape)): - raise ValueError("The axis should be smaller than " - "the arity of input and bigger than 0.") - new_shape = [ - -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)]) - ] - out = nn.reshape(x=input, shape=new_shape) + out = nn.flatten(x=input, axis=axis) return out def _is_list_or_tuple_(data): @@ -889,7 +1055,7 @@ def multi_box_head(inputs, min_sizes = [] max_sizes = [] step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2))) - for ratio in xrange(min_ratio, max_ratio + 1, step): + for ratio in six.moves.range(min_ratio, max_ratio + 1, step): min_sizes.append(base_size * ratio / 100.) max_sizes.append(base_size * (ratio + step) / 100.) 
min_sizes = [base_size * .10] + min_sizes @@ -939,7 +1105,8 @@ def multi_box_head(inputs, step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0] box, var = prior_box(input, image, min_size, max_size, aspect_ratio, - variance, flip, clip, step, offset) + variance, flip, clip, step, offset, None, + min_max_aspect_ratios_order) box_results.append(box) var_results.append(var) @@ -956,11 +1123,13 @@ def multi_box_head(inputs, stride=stride) mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1]) - new_shape = [ - mbox_loc.shape[0], - mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4 + compile_shape = [ + mbox_loc.shape[0], cpt.floor_division( + mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3], 4), 4 ] - mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) + run_shape = tensor.assign(numpy.array([0, -1, 4]).astype("int32")) + mbox_loc_flatten = nn.reshape( + mbox_loc, shape=compile_shape, actual_shape=run_shape) mbox_locs.append(mbox_loc_flatten) # get conf @@ -972,11 +1141,16 @@ def multi_box_head(inputs, padding=pad, stride=stride) conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1]) - new_shape = [ - conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] * - conf_loc.shape[3] / num_classes, num_classes + new_shape = [0, -1, num_classes] + compile_shape = [ + conf_loc.shape[0], + cpt.floor_division(conf_loc.shape[1] * conf_loc.shape[2] * + conf_loc.shape[3], num_classes), num_classes ] - conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape) + run_shape = tensor.assign( + numpy.array([0, -1, num_classes]).astype("int32")) + conf_loc_flatten = nn.reshape( + conf_loc, shape=compile_shape, actual_shape=run_shape) mbox_confs.append(conf_loc_flatten) if len(box_results) == 1: @@ -1091,3 +1265,135 @@ def anchor_generator(input, anchor.stop_gradient = True var.stop_gradient = True return anchor, var + + +def generate_proposal_labels(rpn_rois, + gt_classes, + is_crowd, + gt_boxes, + im_info, + batch_size_per_im=256, + fg_fraction=0.25, + fg_thresh=0.25, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=None, + use_random=True): + """ + ** Generate proposal labels Faster-RCNN ** + TODO(buxingyuan): Add Document + """ + + helper = LayerHelper('generate_proposal_labels', **locals()) + + rois = helper.create_tmp_variable(dtype=rpn_rois.dtype) + labels_int32 = helper.create_tmp_variable(dtype=gt_classes.dtype) + bbox_targets = helper.create_tmp_variable(dtype=rpn_rois.dtype) + bbox_inside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) + bbox_outside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) + + helper.append_op( + type="generate_proposal_labels", + inputs={ + 'RpnRois': rpn_rois, + 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, + 'GtBoxes': gt_boxes, + 'ImInfo': im_info + }, + outputs={ + 'Rois': rois, + 'LabelsInt32': labels_int32, + 'BboxTargets': bbox_targets, + 'BboxInsideWeights': bbox_inside_weights, + 'BboxOutsideWeights': bbox_outside_weights + }, + attrs={ + 'batch_size_per_im': batch_size_per_im, + 'fg_fraction': fg_fraction, + 'fg_thresh': fg_thresh, + 'bg_thresh_hi': bg_thresh_hi, + 'bg_thresh_lo': bg_thresh_lo, + 'bbox_reg_weights': bbox_reg_weights, + 'class_nums': class_nums, + 'use_random': use_random + }) + + rois.stop_gradient = True + labels_int32.stop_gradient = True + bbox_targets.stop_gradient = True + bbox_inside_weights.stop_gradient = True + bbox_outside_weights.stop_gradient = True + + return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights 
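# [Editor's note] The docstring of generate_proposal_labels above is still a TODO, so here is a
# hedged usage sketch. The data layers, shapes, and class_nums value are hypothetical; the
# argument names follow the signature added in this patch.
import paddle.fluid as fluid

rpn_rois = fluid.layers.data(name='rpn_rois', shape=[4], dtype='float32', lod_level=1)
gt_classes = fluid.layers.data(name='gt_classes', shape=[1], dtype='int32', lod_level=1)
is_crowd = fluid.layers.data(name='is_crowd', shape=[1], dtype='int32', lod_level=1)
gt_boxes = fluid.layers.data(name='gt_boxes', shape=[4], dtype='float32', lod_level=1)
im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')

rois, labels, bbox_targets, bbox_inside_w, bbox_outside_w = fluid.layers.generate_proposal_labels(
    rpn_rois=rpn_rois,
    gt_classes=gt_classes,
    is_crowd=is_crowd,
    gt_boxes=gt_boxes,
    im_info=im_info,
    batch_size_per_im=256,
    class_nums=81)  # assumed: 80 foreground classes plus background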
+ + +def generate_proposals(scores, + bbox_deltas, + im_info, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + name=None): + """ + ** Generate Proposals for Faster-RCNN ** + + This operation proposes RoIs according to each box's probability of being a foreground object, and + the boxes can be calculated from the anchors. Bbox_deltas and the objectness scores are the output of the RPN. The final proposals + could be used to train the detection net. + + For generating proposals, this operation performs the following steps: + + 1. Transposes and reshapes scores and bbox_deltas to sizes of (H*W*A, 1) and (H*W*A, 4). + 2. Calculates box locations as proposal candidates. + 3. Clips boxes to the image. + 4. Removes predicted boxes with small area. + 5. Applies NMS to get the final proposals as output. + + + Args: + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between the predicted box location and the anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] represents the origin image information for the N batch items. Info contains height, width and the scale + between the origin image size and the size of the feature map. + anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. + variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. + pre_nms_top_n(int): Number of total bboxes to be kept per image before NMS. 6000 by default. + post_nms_top_n(int): Number of total bboxes to be kept per image after NMS. 1000 by default. + nms_thresh(float): Threshold in NMS, 0.5 by default. + min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. + eta(float): Applied in adaptive NMS; if adaptive_threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + """ + helper = LayerHelper('generate_proposals', **locals()) + + rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype) + helper.append_op( + type="generate_proposals", + inputs={ + 'Scores': scores, + 'BboxDeltas': bbox_deltas, + 'ImInfo': im_info, + 'Anchors': anchors, + 'Variances': variances + }, + attrs={ + 'pre_nms_topN': pre_nms_top_n, + 'post_nms_topN': post_nms_top_n, + 'nms_thresh': nms_thresh, + 'min_size': min_size, + 'eta': eta + }, + outputs={'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs}) + rpn_rois.stop_gradient = True + rpn_roi_probs.stop_gradient = True + + return rpn_rois, rpn_roi_probs diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py index e0c1aab230aeed7fb858e91e7da7eae58032ee16..43ebd160de3fd3d2a491a3ec1fbe0e4085fbd0b1 100644 --- a/python/paddle/fluid/layers/device.py +++ b/python/paddle/fluid/layers/device.py @@ -15,13 +15,17 @@ All util layers.
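To make the five steps above concrete, here is a hedged sketch of wiring generate_proposals behind anchor_generator (defined earlier in the same file); the RPN head shapes and all parameter values are illustrative assumptions, not settings taken from this PR:

    import paddle.fluid as fluid

    # RPN head outputs for a 38x50 feature map with 15 anchors per position.
    rpn_feat = fluid.layers.data(name='rpn_feat', shape=[512, 38, 50], dtype='float32')
    scores = fluid.layers.data(name='scores', shape=[15, 38, 50], dtype='float32')
    bbox_deltas = fluid.layers.data(name='bbox_deltas', shape=[60, 38, 50], dtype='float32')
    im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')

    # 5 sizes x 3 aspect ratios = 15 anchors per feature-map position.
    anchors, variances = fluid.layers.anchor_generator(
        input=rpn_feat, anchor_sizes=[32, 64, 128, 256, 512],
        aspect_ratios=[0.5, 1.0, 2.0], stride=[16.0, 16.0])
    rois, roi_probs = fluid.layers.generate_proposals(
        scores=scores, bbox_deltas=bbox_deltas, im_info=im_info,
        anchors=anchors, variances=variances,
        pre_nms_top_n=6000, post_nms_top_n=1000)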
""" -from layer_function_generator import autodoc +from __future__ import print_function + +from .layer_function_generator import autodoc from ..framework import unique_name from ..layer_helper import LayerHelper +from ..annotations import deprecated -__all__ = ['get_places'] +__all__ = [] +@deprecated(since='0.15.0', instead="ParallelExecutor") @autodoc() def get_places(device_count=None, device_type=None): helper = LayerHelper('get_places', **locals()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index f33ae76aea95ceeca73c5bae6e4e490cdff29bf3..0cf7aaef4ab75ca6976465d1b404004a9f2f64c5 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -11,20 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import contextlib +import multiprocessing +import six +import threading +from ..data_feeder import DataFeeder +from .control_flow import BlockGuard +from .layer_function_generator import templatedoc from .. import core -from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program -from ..unique_name import generate as unique_name -from control_flow import BlockGuard -from ..layer_helper import LayerHelper from ..executor import global_scope -from layer_function_generator import generate_layer_fn, templatedoc +from ..framework import convert_np_dtype_to_dtype_, default_main_program, \ + default_startup_program, program_guard, Program, Variable +from ..layer_helper import LayerHelper +from ..unique_name import generate as unique_name __all__ = [ - 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv', - 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', - 'double_buffer', 'random_data_generator', 'Preprocessor', 'load' + 'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', + 'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor', + 'load' ] @@ -65,7 +72,7 @@ def data(name, """ helper = LayerHelper('data', **locals()) shape = list(shape) - for i in xrange(len(shape)): + for i in six.moves.range(len(shape)): if shape[i] is None: shape[i] = -1 append_batch_size = False @@ -202,7 +209,7 @@ class ListenAndServ(object): }) -def Send(endpoints, send_vars, sync=True): +def Send(endpoints, send_vars, dummy_output=None, sync=True): """ Send variables to the server side, and get vars from server side when server have finished running server side program. 
@@ -216,6 +223,13 @@ def Send(endpoints, send_vars, sync=True): """ assert (type(send_vars) == list) + if dummy_output is None: + dummy_output = [] + elif isinstance(dummy_output, Variable): + dummy_output = [dummy_output] + + assert (type(dummy_output) == list) + epmap = endpoints.split(",") endpoints = list(set(epmap)) @@ -225,16 +239,21 @@ def Send(endpoints, send_vars, sync=True): helper.append_op( type="send", inputs={"X": send_vars}, + outputs={"Out": dummy_output}, attrs={ "endpoints": endpoints, "epmap": epmap, rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC }) if sync: - helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) + helper.append_op( + type="send_barrier", + inputs={"X": dummy_output}, + outputs={"Out": []}, + attrs={"endpoints": endpoints}) -def Recv(endpoints, get_vars, sync=True): +def Recv(endpoints, get_vars, dummy_input=None, sync=True): """ Receive variables from server side @@ -249,18 +268,28 @@ def Recv(endpoints, get_vars, sync=True): """ assert (type(get_vars) == list) + if dummy_input is None: + dummy_input = [] + elif isinstance(dummy_input, Variable): + dummy_input = [dummy_input] + + assert (type(dummy_input) == list) + epmap = endpoints.split(",") endpoints = list(set(epmap)) helper = LayerHelper("Recv", **locals()) helper.append_op( type="recv", - inputs={"X": get_vars}, + inputs={"X": dummy_input}, outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) if sync: - helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints}) + helper.append_op( + type="fetch_barrier", + outputs={"Out": get_vars}, + attrs={"endpoints": endpoints}) return get_vars @@ -375,9 +404,6 @@ def open_recordio_file(filename, if pass_num > 1: main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - return monkey_patch_reader_methods(main_prog_var) @@ -386,9 +412,9 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): Create a uniform random data generator This layer returns a Reader Variable. - Instead of opening a file and reading data from it, this - Reader Variable generates float uniform random data by itself. - It can be used as a dummy reader to test a network without + Instead of opening a file and reading data from it, this + Reader Variable generates float uniform random data by itself. + It can be used as a dummy reader to test a network without opening a real file. Args: @@ -442,40 +468,287 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): main_prog_var = _copy_reader_var_(default_main_program().current_block(), startup_var) - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - return monkey_patch_reader_methods(main_prog_var) +def py_reader(capacity, + shapes, + dtypes, + lod_levels=None, + name=None, + use_double_buffer=True): + """ + Create a Python reader for data feeding in Python + + This layer returns a Reader Variable. + The Reader provides :code:`decorate_paddle_reader()` and + :code:`decorate_tensor_provider()` to set a Python generator as the data + source in Python side. When :code:`Executor::Run()` is invoked in C++ + side, the data from the generator would be read automatically. Unlike + :code:`DataFeeder.feed()`, the data reading process and + :code:`Executor::Run()` process can run in parallel using + :code:`py_reader`. 
The :code:`start()` method of the Reader should be + called when each pass begins, while the :code:`reset()` method should be + called when the pass ends and :code:`fluid.core.EOFException` raises. + Note that :code:`Program.clone()` method cannot clone :code:`py_reader`. + + Args: + capacity(int): The buffer capacity maintained by :code:`py_reader`. + shapes(list|tuple): List of tuples which declaring data shapes. + dtypes(list|tuple): List of strs which declaring data type. + lod_levels(list|tuple): List of ints which declaring data lod_level. + name(basestring): The prefix Python queue name and Reader name. None will + be generated automatically. + use_double_buffer(bool): Whether use double buffer or not. + + Returns: + Variable: A Reader from which we can get feeding data. + + Examples: + + 1. The basic usage of :code:`py_reader` is as follows: + + >>> import paddle.v2 + >>> import paddle.fluid as fluid + >>> import paddle.dataset.mnist as mnist + >>> + >>> reader = fluid.layers.py_reader(capacity=64, + >>> shapes=[(-1,3,224,224), (-1,1)], + >>> dtypes=['float32', 'int64']) + >>> reader.decorate_paddle_reader( + >>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) + >>> + >>> img, label = fluid.layers.read_file(reader) + >>> loss = network(img, label) # some network definition + >>> + >>> fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program()) + >>> + >>> exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + >>> for epoch_id in range(10): + >>> reader.start() + >>> try: + >>> while True: + >>> exe.run(fetch_list=[loss.name]) + >>> except fluid.core.EOFException: + >>> reader.reset() + + 2. When training and testing are both performed, two different + :code:`py_reader` should be created with different names, e.g.: + + >>> import paddle.v2 + >>> import paddle.fluid as fluid + >>> import paddle.dataset.mnist as mnist + >>> + >>> def network(reader): + >>> img, label = fluid.layers.read_file(reader) + >>> # Here, we omitted the network definition + >>> return loss + >>> + >>> train_reader = fluid.layers.py_reader(capacity=64, + >>> shapes=[(-1,3,224,224), (-1,1)], + >>> dtypes=['float32', 'int64'], + >>> name='train_reader') + >>> train_reader.decorate_paddle_reader( + >>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) + >>> + >>> test_reader = fluid.layers.py_reader(capacity=32, + >>> shapes=[(-1,3,224,224), (-1,1)], + >>> dtypes=['float32', 'int64'], + >>> name='test_reader') + >>> test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512)) + >>> + >>> # Create train_main_prog and train_startup_prog + >>> train_main_prog = fluid.Program() + >>> train_startup_prog = fluid.Program() + >>> with fluid.program_guard(train_main_prog, train_startup_prog): + >>> # Use fluid.unique_name.guard() to share parameters with test program + >>> with fluid.unique_name.guard(): + >>> train_loss = network(train_reader) # some network definition + >>> adam = fluid.optimizer.Adam(learning_rate=0.01) + >>> adam.minimize(loss) + >>> + >>> # Create test_main_prog and test_startup_prog + >>> test_main_prog = fluid.Program() + >>> test_startup_prog = fluid.Program() + >>> with fluid.program_guard(test_main_prog, test_startup_prog): + >>> # Use fluid.unique_name.guard() to share parameters with train program + >>> with fluid.unique_name.guard(): + >>> test_loss = network(test_reader) + >>> + >>> fluid.Executor(fluid.CUDAPlace(0)).run(train_startup_prog) + >>> fluid.Executor(fluid.CUDAPlace(0)).run(test_startup_prog) + >>> + >>> train_exe = 
fluid.ParallelExecutor(use_cuda=True, + >>> loss_name=train_loss.name, main_program=train_main_prog) + >>> test_exe = fluid.ParallelExecutor(use_cuda=True, + >>> loss_name=test_loss.name, main_program=test_main_prog) + >>> for epoch_id in range(10): + >>> train_reader.start() + >>> try: + >>> while True: + >>> train_exe.run(fetch_list=[train_loss.name]) + >>> except fluid.core.EOFException: + >>> train_reader.reset() + >>> + >>> test_reader.start() + >>> try: + >>> while True: + >>> test_exe.run(fetch_list=[test_loss.name]) + >>> except fluid.core.EOFException: + >>> test_reader.reset() + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + if lod_levels is None: + lod_levels = [0] * len(shapes) + + if name is None: + queue_name = unique_name('lod_tensor_blocking_queue') + reader_name = unique_name('create_py_reader') + double_buffer_name = unique_name('double_buffer') + else: + queue_name = "_".join([name, "queue"]) + reader_name = "_".join([name, "reader"]) + double_buffer_name = "_".join([name, "double_buffer"]) + + var = global_scope().var(queue_name) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=reader_name) + startup_blk.append_op( + type='create_py_reader', + inputs={'blocking_queue': [queue_name]}, + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) + + reader = monkey_patch_reader_methods(main_prog_var) + if use_double_buffer: + double_buffer_reader = double_buffer(reader, name=double_buffer_name) + # we return a double buffer reader. However, the reset method comes from + # py_reader. 
+ double_buffer_reader.reset = reader.reset + reader = double_buffer_reader + + # monkey patch py_reader special methods + reader.queue = feed_queue + current_reset_method = reader.reset + reader.thread = None + reader.tensor_provider = None + reader.exited = False + + def start_provide_thread(func): + def __provider_thread__(): + for tensors in func(): + array = core.LoDTensorArray() + for item in tensors: + if not isinstance(item, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(item, core.CPUPlace()) + item = tmp + + array.append(item) + + if reader.exited: + break + feed_queue.push(array) + if reader.exited: + break + feed_queue.close() + + reader.thread = threading.Thread(target=__provider_thread__) + reader.thread.daemon = True + reader.thread.start() + + def __set_tensor_provider__(func): + reader.tensor_provider = func + + def __set_paddle_reader__(paddle_reader): + with program_guard(Program(), Program()): + feed_list = [] + counter = 0 + for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels): + name = str(counter) + feed_list.append( + data( + name=name, + dtype=dtype, + shape=shape, + lod_level=lod_level)) + counter += 1 + + feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace()) + paddle_reader = feeder.decorate_reader( + paddle_reader, multi_devices=False) + + def __tensor_provider__(): + for slots in paddle_reader(): + yield [slots[str(idx)] for idx in six.moves.xrange(counter)] + + __set_tensor_provider__(__tensor_provider__) + + def __reset__(): + current_reset_method() + if reader.thread is not None and reader.tensor_provider is not None: + reader.exited = True + reader.thread.join() + reader.exited = False + + def __start__(): + start_provide_thread(reader.tensor_provider) + + reader.reset = __reset__ + reader.decorate_tensor_provider = __set_tensor_provider__ + reader.decorate_paddle_reader = __set_paddle_reader__ + reader.start = __start__ + + return reader + + def open_files(filenames, shapes, lod_levels, dtypes, - thread_num=1, + thread_num=None, buffer_size=None, pass_num=1, - for_parallel=True): + is_test=None): """ Open files - This layer takes a list of files to read from and returns a Reader Variable. - Via the Reader Variable, we can get data from given files. All files must - have name suffixs to indicate their formats, e.g., '*.recordio'. + This layer takes a list of files to read from and returns a Reader Variable. + Via the Reader Variable, we can get data from given files. All files must + have name suffixs to indicate their formats, e.g., '*.recordio'. Args: filenames(list): The list of file names. shapes(list): List of tuples which declaring data shapes. lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. - thread_num(int): The maximal concurrent prefetch thread number. - buffer_size(int|None): The size of prefetch buffer. If it is setted None, - buffer size will be thread_num * 3. - Default: None + thread_num(None): The number of thread to read files. + Default: min(len(filenames), cpu_number). + buffer_size(None): The buffer size of reader. Default: 3 * thread_num pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - Default: True + is_test(bool|None): Whether `open_files` used for testing or not. If it + is used for testing, the order of data generated is same as the file + order. Otherwise, it is not guaranteed the order of data is same + between every epoch. [Default: False]. 
Returns: Variable: A Reader Variable via which we can get file data. @@ -487,16 +760,22 @@ def open_files(filenames, './data2.recordio'], shapes=[(3,224,224), (1)], lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=2, - buffer_size=2) + dtypes=['float32', 'int64']) # Via the reader, we can use 'read_file' layer to get data: image, label = fluid.layers.io.read_file(reader) """ + if thread_num is None: + thread_num = min(len(filenames), multiprocessing.cpu_count()) + else: + thread_num = int(thread_num) + if buffer_size is None: - buffer_size = thread_num * 3 - if isinstance(filenames, basestring): + buffer_size = 3 * thread_num + else: + buffer_size = int(buffer_size) + + if isinstance(filenames, six.string_types): filenames = [filenames] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] @@ -509,17 +788,18 @@ def open_files(filenames, multi_file_reader_name = unique_name('multi_file_reader') startup_blk = default_startup_program().current_block() startup_reader = startup_blk.create_var(name=multi_file_reader_name) + attrs = { + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks, + 'file_names': filenames, + 'thread_num': thread_num, + 'buffer_size': buffer_size + } + if is_test is not None: + attrs['is_test'] = is_test startup_blk.append_op( - type='open_files', - outputs={'Out': [startup_reader]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'ranks': ranks, - 'file_names': filenames, - 'thread_num': thread_num, - 'buffer_size': buffer_size - }) + type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs) startup_reader.desc.set_dtypes(dtypes) startup_reader.persistable = True @@ -529,9 +809,6 @@ def open_files(filenames, main_prog_reader = multi_pass( reader=main_prog_reader, pass_num=pass_num) - if for_parallel: - main_prog_reader = parallel(reader=main_prog_reader) - return monkey_patch_reader_methods(main_prog_reader) @@ -573,9 +850,9 @@ def shuffle(reader, buffer_size): def batch(reader, batch_size): """ - This layer is a reader decorator. It takes a reader and adds - 'batching' decoration on it. When reading with the result - decorated reader, output data will be automatically organized + This layer is a reader decorator. It takes a reader and adds + 'batching' decoration on it. When reading with the result + decorated reader, output data will be automatically organized to the form of batches. Args: @@ -600,11 +877,11 @@ def batch(reader, batch_size): # If we read data with the raw_reader: # data = fluid.layers.read_file(raw_reader) # We can only get data instance by instance. - # + # # However, if we read data with the batch_reader: # data = fluid.layers.read_file(batch_reader) - # Each 5 adjacent instances will be automatically combined together - # to become a batch. So what we get('data') is a batch data instead + # Each 5 adjacent instances will be automatically combined together + # to become a batch. So what we get('data') is a batch data instead # of an instance. """ return __create_unshared_decorated_reader__( @@ -647,17 +924,12 @@ def multi_pass(reader, pass_num): 'create_multi_pass_reader', reader, {'pass_num': int(pass_num)}) -def parallel(reader): - return __create_shared_decorated_reader__('create_threaded_reader', reader, - {}) - - def read_file(reader): """ Execute the given reader and get data via it. - A reader is also a Variable. It can be a raw reader generated by - `fluid.layers.open_files()` or a decorated one generated by + A reader is also a Variable. 
It can be a raw reader generated by + `fluid.layers.open_files()` or a decorated one generated by `fluid.layers.double_buffer()` and so on. Args: @@ -730,7 +1002,7 @@ class Preprocessor(object): self.sink_var_names = None self.status = Preprocessor.BEFORE_SUB_BLOCK - def is_completed(self): + def _is_completed(self): return self.sub_block and self.source_var_names and self.sink_var_names @contextlib.contextmanager @@ -740,7 +1012,7 @@ class Preprocessor(object): yield self.main_prog.rollback() self.status = Preprocessor.AFTER_SUB_BLOCK - if not self.is_completed(): + if not self._is_completed(): raise RuntimeError( "The definition of preprocessor is incompleted! " "Please make sure that you have set input and output " @@ -758,7 +1030,7 @@ class Preprocessor(object): source_lod_levels = self.underlying_reader.desc.lod_levels() self.source_var_names = [ unique_name("preprocessor_source") - for _ in xrange(len(source_shapes)) + for _ in six.moves.range(len(source_shapes)) ] source_vars = [] for var_name, shape, dtype, lod_level in zip( diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 3096389101a5e5b302c78145b8bc9f1d71f6b8cb..8963d74de014d69c590276d5ff7080111f614230 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import re -import cStringIO import functools import warnings import string +from six.moves import cStringIO from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper @@ -70,7 +72,7 @@ def _generate_doc_string_(op_proto): if not isinstance(op_proto, framework_pb2.OpProto): raise TypeError("OpProto should be `framework_pb2.OpProto`") - buf = cStringIO.StringIO() + buf = cStringIO() buf.write(escape_math(op_proto.comment)) buf.write('\nArgs:\n') for each_input in op_proto.inputs: @@ -119,9 +121,9 @@ def generate_layer_fn(op_type): """ op_proto = OpProtoHolder.instance().get_op_proto(op_type) not_intermediate_outputs = \ - filter(lambda output: not output.intermediate, op_proto.outputs) + [output for output in op_proto.outputs if not output.intermediate] intermediate_outputs = \ - filter(lambda output: output.intermediate, op_proto.outputs) + [output for output in op_proto.outputs if output.intermediate] if len(not_intermediate_outputs) != 1: raise ValueError("Only one non intermediate output operator can be", diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 6071e3e74218e4db4cddc223818d3a9b7086fd86..be368007dd7061ba7fc97414dbadfce00d158776 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -20,10 +20,12 @@ User can also implement their own learning_rate_decay strategy according to this module. """ -import control_flow -import nn -import ops -import tensor +from __future__ import print_function + +from . import control_flow +from . import nn +from . import ops +from . import tensor from ..initializer import init_on_cpu from ..framework import default_main_program, Parameter @@ -62,20 +64,20 @@ def noam_decay(d_model, warmup_steps): The decayed learning rate. 
""" global_step = _decay_step_counter(1) - with init_on_cpu(): - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) + + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) return lr_value def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ - Applies exponential decay to the learning rate. + Applies exponential decay to the learning rate. - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, the learning rate will be decayed by + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, the learning rate will be decayed by 'decay_rate' every 'decay_steps' steps. >>> if staircase == True: @@ -108,12 +110,10 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ global_step = _decay_step_counter() - with init_on_cpu(): - # update learning_rate - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) return decayed_lr @@ -138,11 +138,10 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ global_step = _decay_step_counter() - with init_on_cpu(): - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) return decayed_lr @@ -151,8 +150,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ Applies inverse time decay to the initial learning rate. - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, an inverse decay function will be + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, an inverse decay function will be applied to the initial learning rate. 
>>> if staircase == True: @@ -184,12 +183,11 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ global_step = _decay_step_counter() - with init_on_cpu(): - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) - decayed_lr = learning_rate / (1 + decay_rate * div_res) + decayed_lr = learning_rate / (1 + decay_rate * div_res) return decayed_lr @@ -224,25 +222,22 @@ def polynomial_decay(learning_rate, """ global_step = _decay_step_counter() - with init_on_cpu(): - if cycle: - div_res = ops.ceil(global_step / decay_steps) - zero_var = tensor.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = tensor.fill_constant( - shape=[1], dtype='float32', value=1.0) - - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - tensor.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) + if cycle: + div_res = ops.ceil(global_step / decay_steps) + zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0) + one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0) - decayed_lr = (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate + with control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + tensor.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate return decayed_lr @@ -277,28 +272,28 @@ def piecewise_decay(boundaries, values): global_step = _decay_step_counter() - with init_on_cpu(): - lr = tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") + lr = tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") - with control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = tensor.fill_constant( - shape=[1], dtype='float32', value=float(boundaries[i])) - value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(global_step < boundary_val): - tensor.assign(value_var, lr) - last_value_var = tensor.fill_constant( + with control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = tensor.fill_constant( shape=[1], dtype='float32', - value=float(values[len(values) - 1])) - with switch.default(): - tensor.assign(last_value_var, lr) + value=float(boundaries[i]), + force_cpu=True) + value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(global_step < boundary_val): + tensor.assign(value_var, lr) + last_value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[len(values) - 1])) + with switch.default(): + tensor.assign(last_value_var, lr) return lr @@ -333,9 +328,9 @@ def append_LARS(params_grads, learning_rate, weight_decay): grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) if type(param_lr) == float and param_lr == 
1.0: decayed_lr = learning_rate * param_norm \ - / _balanced_weight(param_norm, grad_norm) + / _balanced_weight(param_norm, grad_norm) else: decayed_lr = learning_rate * param_lr * param_norm \ - / _balanced_weight(param_norm, grad_norm) + / _balanced_weight(param_norm, grad_norm) # set back param local learning rate param.optimize_attr['learning_rate'] = decayed_lr diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 1754061c4ba6f5b97bced3548bc412dfb1b7932c..a458cebfb194a068d040a8919fd4abcb4b4bea80 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from ..framework import Variable, unique_name -from layer_function_generator import OpProtoHolder +from .layer_function_generator import OpProtoHolder from ..initializer import force_init_on_cpu -__all__ = ['monkey_patch_variable'] - def monkey_patch_variable(): def unique_tmp_name(): diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 99e82fdd04282177fae63f1fb94b5e32d41c612e..b1598bfec210474ae1e17f9f88e8b57aa80b8452 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -15,12 +15,14 @@ All layers just related to metric. """ +from __future__ import print_function + import warnings from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -import nn +from . import nn __all__ = ['accuracy', 'auc'] @@ -76,14 +78,14 @@ def accuracy(input, label, k=1, correct=None, total=None): return acc_out -def auc(input, label, curve='ROC', num_thresholds=200): +def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): """ **Area Under the Curve (AUC) Layer** This implementation computes the AUC according to forward output and label. - It is used very widely in binary classification evaluation. + It is used very widely in binary classification evaluation. - Note: If input label contains values other than 0 and 1, it will be cast + Note: If input label contains values other than 0 and 1, it will be cast to `bool`. Find the relevant definitions `here `_. @@ -93,46 +95,55 @@ def auc(input, label, curve='ROC', num_thresholds=200): 2. PR: Precision Recall Args: - input(Variable): A floating-point 2D Variable, values are in the range - [0, 1]. Each row is sorted in descending order. This - input should be the output of topk. Typically, this + input(Variable): A floating-point 2D Variable, values are in the range + [0, 1]. Each row is sorted in descending order. This + input should be the output of topk. Typically, this Variable indicates the probability of each label. - label(Variable): A 2D int Variable indicating the label of the training + label(Variable): A 2D int Variable indicating the label of the training data. The height is batch size and width is always 1. curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'. - num_thresholds(int): The number of thresholds to use when discretizing + num_thresholds(int): The number of thresholds to use when discretizing the roc curve. Default 200. + topk(int): only topk number of prediction output will be used for auc. Returns: Variable: A scalar representing the current AUC. Examples: .. 
code-block:: python - + # network is a binary classification model and label the ground truth prediction = network(image, is_infer=True) auc_out=fluid.layers.auc(input=prediction, label=label) """ - - warnings.warn( - "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \ - but can not aggregate them and get the pass AUC, because pass \ - auc can not be averaged with weighted from the minibatch auc value. \ - Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \ - which can get every minibatch and every pass auc value.", Warning) helper = LayerHelper("auc", **locals()) - topk_out = helper.create_tmp_variable(dtype=input.dtype) - topk_indices = helper.create_tmp_variable(dtype="int64") - topk_out, topk_indices = nn.topk(input, k=k) - auc_out = helper.create_tmp_variable(dtype="float32") + auc_out = helper.create_tmp_variable(dtype="float64") + batch_auc_out = helper.create_tmp_variable(dtype="float64") + # make tp, tn, fp, fn persistable, so that can accumulate all batches. + stat_pos = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds + 1]) + stat_neg = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds + 1]) + + for var in [stat_pos, stat_neg]: + helper.set_variable_initializer( + var, Constant( + value=0.0, force_cpu=True)) + helper.append_op( type="auc", inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] + "Predict": [input], + "Label": [label], + "StatPos": [stat_pos], + "StatNeg": [stat_neg] }, attrs={"curve": curve, "num_thresholds": num_thresholds}, - outputs={"AUC": [auc_out], }) - return auc_out + outputs={ + "AUC": [auc_out], + "BatchAUC": [batch_auc_out], + "StatPosOut": [stat_pos], + "StatNegOut": [stat_neg] + }) + return auc_out, batch_auc_out, [stat_pos, stat_neg] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947..3ae0fac4bef5c47964f9a9cd8dd45b57e705e1f8 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ All layers just related to the neural network. """ +from __future__ import print_function + +import numpy as np from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -from layer_function_generator import autodoc, templatedoc -from tensor import concat -import utils -import random +from .layer_function_generator import autodoc, templatedoc +from .tensor import concat +from . import utils from .. 
import unique_name +from functools import reduce __all__ = [ 'fc', @@ -51,6 +54,7 @@ __all__ = [ 'conv2d_transpose', 'conv3d_transpose', 'sequence_expand', + 'sequence_pad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -71,6 +75,7 @@ __all__ = [ 'transpose', 'im2sequence', 'nce', + 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', @@ -80,9 +85,12 @@ __all__ = [ 'one_hot', 'autoincreased_step_counter', 'reshape', + 'squeeze', + 'unsqueeze', 'lod_reset', 'lrn', 'pad', + 'pad_constant_like', 'label_smooth', 'roi_pool', 'dice_loss', @@ -90,11 +98,20 @@ __all__ = [ 'image_resize_short', 'resize_bilinear', 'gather', + 'scatter', 'random_crop', 'mean_iou', 'relu', 'log', 'crop', + 'rank_loss', + 'prelu', + 'flatten', + 'sequence_mask', + 'stack', + 'pad2d', + 'unstack', + 'sequence_enumerate', ] @@ -151,7 +168,8 @@ def fc(input, param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias - of this layer. If it is set to None, no bias will be added to the output units. + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. act (str, default None): Activation to be applied to the output of this layer. is_test(bool): A flag indicating whether execution is in test phase. use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn @@ -342,7 +360,7 @@ def dynamic_lstm(input, """ helper = LayerHelper('lstm', **locals()) - size = size / 4 + size = size // 4 weight = helper.create_parameter( attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) bias_size = [1, 7 * size] @@ -532,7 +550,7 @@ def dynamic_lstmp(input, """ helper = LayerHelper('lstmp', **locals()) - size = size / 4 + size = size // 4 weight = helper.create_parameter( attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype) proj_weight = helper.create_parameter( @@ -760,7 +778,7 @@ def gru_unit(input, helper = LayerHelper('gru_unit', **locals()) dtype = helper.input_dtype() - size = size / 3 + size = size // 3 # create weight weight = helper.create_parameter( @@ -932,6 +950,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): helper = LayerHelper('dropout', **locals()) out = helper.create_tmp_variable(dtype=x.dtype) mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) + + if (seed is None or seed == 0) and helper.main_program.random_seed != 0: + seed = helper.main_program.random_seed + helper.append_op( type='dropout', inputs={'X': [x]}, @@ -946,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): return out -def cross_entropy(input, label, soft_label=False): +def cross_entropy(input, label, soft_label=False, ignore_index=-100): """ **Cross Entropy Layer** @@ -990,7 +1012,10 @@ def cross_entropy(input, label, soft_label=False): tensor with shape [N x D]. soft_label (bool): a flag indicating whether to interpretate the given labels as soft - labels, default `False`. + labels. Default: `False`. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. 
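A short sketch of the ignore_index argument documented above, under illustrative names and shapes: positions labeled -100 contribute neither loss nor gradient, and the flag is only meaningful with soft_label=False.

    import paddle.fluid as fluid

    # 'predict' stands in for a softmax output over 10 classes.
    predict = fluid.layers.data(name='predict', shape=[10], dtype='float32')
    # Labels use -100 to mark padded positions that should be skipped.
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    loss = fluid.layers.cross_entropy(
        input=predict, label=label, soft_label=False, ignore_index=-100)
    avg_loss = fluid.layers.mean(loss)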
@@ -1015,7 +1040,8 @@ def cross_entropy(input, label, soft_label=False): inputs={'X': [input], 'Label': [label]}, outputs={'Y': [out]}, - attrs={"soft_label": soft_label}) + attrs={"soft_label": soft_label, + "ignore_index": ignore_index}) return out @@ -1240,7 +1266,7 @@ def sequence_conv(input, outputs={"Out": pre_bias}, attrs={ 'contextStride': filter_stride, - 'contextStart': -int(filter_size / 2), + 'contextStart': -int(filter_size // 2), 'contextLength': filter_size }) pre_act = helper.append_bias_op(pre_bias) @@ -1296,13 +1322,16 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): """ - The input of the softmax layer is a 2-D tensor with shape N x K (N is the - batch_size, K is the dimension of input feature). The output tensor has the - same shape as the input tensor. + The input of the softmax operator is a tensor of any rank. The output tensor + has the same shape as the input. - For each row of the input tensor, the softmax operator squashes the - K-dimensional vector of arbitrary real values to a K-dimensional vector of real - values in the range [0, 1] that add up to 1. + The input tensor will first be logically flattened to a 2-D matrix. The matrix's + second dimension(row length) is as same as the last dimension of the input + tensor, and the first dimension(column length) is the product of all other + dimensions of the input tensor. For each row of the matrix, the softmax operator + squashes the K-dimensional(K is the width of the matrix, which is also the size + of the input tensor's last dimension) vector of arbitrary real values to a + K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input. @@ -1310,7 +1339,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): exponential values of all the other dimensions is the output of the softmax operator. - For each row :math:`i` and each column :math:`j` in Input(X), we have: + For each row :math:`i` and each column :math:`j` in the matrix, we have: .. 
math:: @@ -1469,7 +1498,7 @@ def conv2d(input, else: if num_channels % groups != 0: raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = num_channels / groups + num_filter_channels = num_channels // groups filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') stride = utils.convert_to_list(stride, 2, 'stride') @@ -1480,7 +1509,7 @@ def conv2d(input, raise ValueError("use_cudnn should be True or False") input_shape = input.shape - filter_shape = [num_filters, num_filter_channels] + filter_size + filter_shape = [num_filters, int(num_filter_channels)] + filter_size def _get_default_param_initializer(): std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 @@ -1631,7 +1660,7 @@ def conv3d(input, else: if num_channels % groups != 0: raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = num_channels / groups + num_filter_channels = num_channels // groups filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') stride = utils.convert_to_list(stride, 3, 'stride') @@ -2366,16 +2395,16 @@ def conv2d_transpose(input, w_in = input.shape[3] filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 * - padding[0] - 1) / dilation[0] + 1 + padding[0] - 1) // dilation[0] + 1 filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * - padding[1] - 1) / dilation[1] + 1 + padding[1] - 1) // dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] else: filter_size = utils.convert_to_list(filter_size, 2, 'conv2d_transpose.filter_size') groups = 1 if groups is None else groups - filter_shape = [input_channel, num_filters / groups] + filter_size + filter_shape = [input_channel, num_filters // groups] + filter_size img_filter = helper.create_parameter( dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) @@ -2533,18 +2562,18 @@ def conv3d_transpose(input, w_in = input.shape[4] filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 * - padding[0] - 1) / dilation[0] + 1 + padding[0] - 1) // dilation[0] + 1 filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 * - padding[1] - 1) / dilation[1] + 1 + padding[1] - 1) // dilation[1] + 1 filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 * - padding[2] - 1) / dilation[2] + 1 + padding[2] - 1) // dilation[2] + 1 filter_size = [filter_size_d, filter_size_h, filter_size_w] else: filter_size = utils.convert_to_list(filter_size, 3, 'conv3d_transpose.filter_size') groups = 1 if groups is None else groups - filter_shape = [input_channel, num_filters / groups] + filter_size + filter_shape = [input_channel, num_filters // groups] + filter_size img_filter = helper.create_parameter( dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) @@ -2637,6 +2666,51 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp +@templatedoc() +def sequence_pad(x, pad_value, maxlen=None): + """ + ${comment} + + Args: + x(Variable): Input variable which should contain lod information. + pad_value(Variable): The Variable that holds values that will be fill + into padded steps. It can be a scalar or a tensor whose shape + equals to time steps in sequences. If it's a scalar, it will be + automatically broadcasted to the shape of time step. + maxlen(int, default None): The length of padded sequences. It can be + None or any positive int. When it is None, all sequences will be + padded up to the length of the longest one among them; when it a + certain positive value, it must be greater than the length of the + longest original sequence." 
+ + Returns: + Variable: The padded sequence batch. All sequences has the same length. + + Examples: + .. code-block:: python + + import numpy + + x = fluid.layers.data(name='y', shape=[10, 5], + dtype='float32', lod_level=1) + pad_value = fluid.layers.assign(input=numpy.array([0])) + out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) + """ + + helper = LayerHelper('sequence_pad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + if maxlen is None: + maxlen = -1 + helper.append_op( + type='sequence_pad', + inputs={'X': x, + 'PadValue': pad_value}, + outputs={'Out': out}, + attrs={'padded_length': maxlen}) + return out + + def beam_search(pre_ids, pre_scores, ids, @@ -2651,15 +2725,15 @@ def beam_search(pre_ids, Refer to `Beam search `_ for more details. - - This layer does the search in beams for one time step. Specifically, it + + This layer does the search in beams for one time step. Specifically, it selects the top-K candidate word ids of current step from :attr:`ids` according to their :attr:`scores` for all source sentences, where K is :attr:`beam_size` and :attr:`ids, scores` are predicted results from the computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are the output of beam_search at previous step, they are needed for special use to handle ended candidate translations. - + Note that the :attr:`scores` passed in should be accumulated scores, and length penalty should be done with extra operators before calculating the accumulated scores if needed, also suggest finding top-K before it and @@ -2945,7 +3019,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. fluid.layers.reduce_sum(x) # [3.5] fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6] @@ -2954,7 +3028,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26] fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20] @@ -3476,11 +3550,6 @@ def topk(input, k, name=None): top5_values, top5_indices = layers.topk(input, k=5) """ - shape = input.shape - if k < 1 or k >= shape[-1]: - raise ValueError("k must be greater than 0 and less than %d." % - (shape[-1])) - helper = LayerHelper("top_k", **locals()) values = helper.create_tmp_variable(dtype=input.dtype) indices = helper.create_tmp_variable(dtype="int64") @@ -3857,6 +3926,74 @@ def nce(input, return cost / (num_neg_samples + 1) +def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): + """ + The hierarchical sigmoid operator is used to accelerate the training + process of language model. This operator organizes the classes into a + complete binary tree, each leaf node represents a class(a word) and each + internal node acts as a binary classifier. For each word there's a unique + path from root to it's leaf node, hsigmoid calculate the cost for each + internal node on the path, and sum them to get a total cost. 
hsigmoid can + achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the size of word dict. + + Refer to `Hierarchical Probabilistic Neural Network Language Model + `_ + + Args: + input (Variable): The input tensor variable with shape + :math:`[N \\times D]`, where :math:`N` is the size of mini-batch, + and :math:`D` is the feature size. + label (Variable): The tensor variable contains labels of training data. + It's a tensor with shape is :math:`[N \\times 1]`. + num_classes: (int), The number of classes, must not be less than 2. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter + attribute for learnable parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter + attribute for the bias of this layer. If it is set to False, no + bias will be applied. + + Returns: + Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1] + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[2], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='int64') + out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6) + """ + + helper = LayerHelper('hierarchical_sigmoid', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + pre_out = helper.create_tmp_variable(dtype) + dim = input.shape[1] + if num_classes < 2: + raise ValueError("num_classes must not be less than 2.") + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[num_classes - 1, dim], + is_bias=False, + dtype=input.dtype) + inputs = {"X": input, "W": weights, "Label": label} + if helper.bias_attr: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[1, num_classes - 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias + helper.append_op( + type="hierarchical_sigmoid", + inputs=inputs, + outputs={"Out": out, + "PreOut": pre_out}, + attrs={"num_classes": num_classes}) + return out + + def transpose(x, perm, name=None): """ Permute the dimensions of `input` according to `perm`. @@ -3892,15 +4029,23 @@ def transpose(x, perm, name=None): helper = LayerHelper('transpose', **locals()) out = helper.create_tmp_variable(x.dtype) + x_shape = helper.create_tmp_variable(x.dtype) helper.append_op( - type='transpose', + type='transpose2', inputs={'X': [x]}, - outputs={'Out': [out]}, + outputs={'Out': [out], + 'XShape': [x_shape]}, attrs={'axis': perm}) return out -def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): +def im2sequence(input, + filter_size=1, + stride=1, + padding=0, + input_image_size=None, + out_stride=1, + name=None): """ Extracts image patches from the input tensor to form a tensor of shape {input.batch_size * output_height * output_width, filter_size_H * @@ -3937,6 +4082,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): padding_up = padding_down = padding_left = padding_right = padding Default: padding = 0. + input_image_size(Variable): the input contains image real size.It's dim + is [batchsize, 2]. It is dispensable.It is just for batch inference. + + out_stride(int|tuple): The scaling of image through CNN. It is + dispensable. It is valid only when input_image_size is not null. + If out_stride is tuple, it must contain two intergers, + (out_stride_H, out_stride_W). Otherwise, + the out_stride_H = out_stride_W = out_stride. + name (int): The name of this layer. It is optional. 
Returns: @@ -3987,7 +4141,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): [ 5. 7. 2. 4. 1. 3. 9. 0.] [ 7. 9. 4. 8. 3. 5. 0. 8.]] - output.dims = {8, 9} + output.dims = {8, 8} output.lod = [[4, 4]] @@ -4009,18 +4163,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): if len(padding) == 2: padding.append(padding[0]) padding.append(padding[1]) - + inputs = {"X": input} + attrs = {"kernels": filter_size, "strides": stride, "padding": padding} + if input_image_size: + if isinstance(out_stride, int): + out_stride = [out_stride, out_stride] + inputs["Y"] = input_image_size + attrs["out_stride"] = out_stride helper = LayerHelper('im2sequence', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) helper.append_op( - type='im2sequence', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'kernels': filter_size, - 'strides': stride, - 'paddings': padding, - }) + type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs) return out @@ -4093,7 +4246,10 @@ def multiplex(inputs, index): return out -def softmax_with_cross_entropy(logits, label, soft_label=False): +def softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100): """ **Softmax With Cross Entropy Operator.** @@ -4135,6 +4291,10 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): soft_label is set to true, Label is a Tensor with soft_label (bool): A flag to indicate whether to interpretate the given labels as soft labels. By default, `soft_label` is set to False. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 + Returns: Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. 
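The fused operator documented above follows the same ignore_index convention as cross_entropy; a minimal sketch with raw logits (names and the 10-class shape are illustrative), relying on the op to apply softmax internally:

    import paddle.fluid as fluid

    logits = fluid.layers.data(name='logits', shape=[10], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # ignore_index only takes effect with hard labels (soft_label=False).
    loss = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=label, soft_label=False, ignore_index=-100)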
@@ -4156,7 +4316,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): 'Label': label}, outputs={'Softmax': softmax, 'Loss': loss}, - attrs={'soft_label': soft_label}) + attrs={'soft_label': soft_label, + 'ignore_index': ignore_index}) return loss @@ -4270,7 +4431,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): helper.set_variable_initializer( counter, initializer=Constant( value=begin - 1, force_cpu=True)) - helper.main_program.global_block().prepend_op( + helper.main_program.global_block()._prepend_op( type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, @@ -4351,7 +4512,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): """ if not (isinstance(shape, list) or isinstance(shape, tuple)): - raise ValueError("Input shape must be a python lsit or tuple.") + raise ValueError("Input shape must be a python list or tuple.") inputs = {"X": x} if isinstance(actual_shape, Variable): inputs["Shape"] = actual_shape @@ -4373,16 +4534,104 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "Each dimension size given in shape must not be negtive " "except one unknown dimension.") - helper = LayerHelper("reshape", **locals()) - reshaped = helper.create_tmp_variable(dtype=x.dtype) + helper = LayerHelper("reshape2", **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + x_shape = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( - type="reshape", + type="reshape2", inputs=inputs, - attrs={"shape": shape, - "inplace": inplace}, - outputs={"Out": reshaped}) + attrs={"shape": shape}, + outputs={"Out": out, + "XShape": x_shape}) + + return helper.append_activation(out) + + +def squeeze(input, axes, name=None): + """ + Remove single-dimensional entries from the shape of a tensor. Takes a + parameter axes with a list of axes to squeeze. If axes is not provided, all + the single dimensions will be removed from the shape. If an axis is + selected with shape entry not equal to one, an error is raised. + + Examples: + Case 1: + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + Case 2: + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) + + Args: + input (Variable): The input variable to be squeezed. + axes (list): List of integers, indicating the dimensions to be squeezed. + name (str|None): Name for this layer. + + Returns: + Variable: Output squeezed variable. + + Examples: + .. code-block:: python + + x = layers.data(name='x', shape=[5, 1, 10]) + y = layers.sequeeze(input=x, axes=[1]) + """ + helper = LayerHelper("squeeze", **locals()) + out = helper.create_tmp_variable(dtype=input.dtype) + x_shape = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type="squeeze2", + inputs={"X": input}, + attrs={"axes": axes}, + outputs={"Out": out, + "XShape": x_shape}) + + return out - return helper.append_activation(reshaped) + +def unsqueeze(input, axes, name=None): + """ + Insert single-dimensional entries to the shape of a tensor. Takes one + required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. + + For example: + Given a tensor such that tensor with shape [3, 4, 5], + then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. + + Args: + input (Variable): The input variable to be unsqueezed. + axes (list): List of integers, indicating the dimensions to be inserted. + name (str|None): Name for this layer. 
+ + Returns: + Variable: Output unsqueezed variable. + + Examples: + .. code-block:: python + + x = layers.data(name='x', shape=[5, 10]) + y = layers.unsequeeze(input=x, axes=[1]) + """ + helper = LayerHelper("unsqueeze", **locals()) + out = helper.create_tmp_variable(dtype=input.dtype) + x_shape = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type="unsqueeze2", + inputs={"X": input}, + attrs={"axes": axes}, + outputs={"Out": out, + "XShape": x_shape}) + + return out def lod_reset(x, y=None, target_lod=None): @@ -4609,6 +4858,86 @@ def pad(x, paddings, pad_value=0., name=None): return out +def pad_constant_like(x, y, pad_value=0., name=None): + """ + Pad input(Y) with :attr:`pad_value`, the number of values padded to + the edges of each axis is specified by the difference of the shape + of X and Y. ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) + unique pad widths for each axis. The input should be a k-D + tensor(k > 0 and k < 7). + + See below for an example. + + .. code-block:: text + + Given: + X = [[[[ 0, 1, 2], + [ 3, 4, 5]], + [[ 6, 7, 8], + [ 9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]]], + [[[18, 19, 20], + [21, 22, 23]], + [[24, 25, 26], + [27, 28, 29]], + [[30, 31, 32], + [33, 34, 35]]]] + X.shape = (2, 3, 2, 3) + + Y = [[[[35, 36, 37]], + [[38, 39, 40]], + [[41, 42, 43]]]] + Y.shape = (1, 3, 1, 3) + + And + pad_value = -1, + + Return: + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) + + Args: + x (Variable): The input tensor variable. + y (Variable): The input tensor variable. + pad_value (float): The constant value used to pad. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The padded tensor variable. + + Examples: + .. code-block:: python + + # x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3) + # y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3) + out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.) + # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3] + """ + helper = LayerHelper('pad_constant_like', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='pad_constant_like', + inputs={'X': x, + 'Y': y}, + outputs={'Out': out}, + attrs={'pad_value': float(pad_value)}) + return out + + def label_smooth(label, prior_dist=None, epsilon=0.1, @@ -4744,7 +5073,7 @@ def dice_loss(input, label, epsilon=0.00001): loss = fluid.layers.dice_loss(input=predictions, label=label, 2) """ label = one_hot(label, depth=input.shape[-1]) - reduce_dim = range(1, len(input.shape)) + reduce_dim = list(range(1, len(input.shape))) inse = reduce_sum(input * label, dim=reduce_dim) dice_denominator = reduce_sum( input, dim=reduce_dim) + reduce_sum( @@ -4940,6 +5269,47 @@ def gather(input, index): return out +def scatter(input, index, updates, name=None): + """ + **Scatter Layer** + + Output is obtained by updating the input on selected indices on the first + axis. + + .. math:: + + Out = X + Out[Ids] = Updates + + Args: + input (Variable): The source input with rank>=1. + index (Variable): The index input with rank=1. Its dtype should be + int32 or int64 as it is used as indexes. + updates (Variable): The updated value of scatter op. + name (str|None): The output variable name. Default None. 
+ + Returns: + output (Variable): The output is a tensor with the same shape as input. + + Examples: + + .. code-block:: python + + output = fluid.layers.scatter(input, index, updates) + + """ + helper = LayerHelper('scatter', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="scatter", + inputs={"X": input, + "Ids": index, + "Updates": updates}, + outputs={"Out": out}) + return out + + @templatedoc() def random_crop(x, shape, seed=None): """ @@ -4962,7 +5332,7 @@ def random_crop(x, shape, seed=None): dtype = x.dtype out = helper.create_tmp_variable(dtype) if seed is None: - seed = random.randint(-65536, 65535) + seed = np.random.randint(-65536, 65536) op_attrs = {"shape": shape} if isinstance(seed, int): op_attrs["startup_seed"] = seed @@ -4982,7 +5352,7 @@ def random_crop(x, shape, seed=None): return out -def log(x): +def log(x, name=None): """ Calculates the natural log of the given input tensor, element-wise. @@ -4992,6 +5362,8 @@ def log(x): Args: x (Variable): Input tensor. + name (str|None, default None): A name for this layer If set None, + the layer will be named automatically. Returns: Variable: The natural log of the input tensor computed element-wise. @@ -5009,7 +5381,7 @@ def log(x): return out -def relu(x): +def relu(x, name=None): """ Relu takes one input data (Tensor) and produces one output data (Tensor) where the rectified linear function, y = max(0, x), is applied to @@ -5021,6 +5393,8 @@ def relu(x): Args: x (Variable): The input tensor. + name (str|None, default None): A name for this layer If set None, + the layer will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -5160,7 +5534,7 @@ def crop(x, shape=None, offsets=None, name=None): helper = LayerHelper('crop', **locals()) if not (isinstance(shape, list) or isinstance(shape, tuple) or \ - isinstance(shape, Variable)): + isinstance(shape, Variable)): raise ValueError("The shape should be a list, tuple or Variable.") if offsets is None: @@ -5184,3 +5558,453 @@ def crop(x, shape=None, offsets=None, name=None): outputs={'Out': out}, attrs=None if len(attrs) == 0 else attrs) return out + + +def rank_loss(label, left, right, name=None): + """ + **Rank loss layer for RankNet** + + RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf) + is a pairwise ranking model with a training sample consisting of a pair + of documents, A and B. Label P indicates whether A is ranked higher than B + or not: + + P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information + about the rank of the input pair. + + Rank loss layer takes three inputs: left (o_i), right (o_j) and + label (P_{i,j}). The inputs respectively represent RankNet's output scores + for documents A and B and the value of label P. The following equation + computes rank loss C_{i,j} from the inputs: + + $$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ + o_{i,j} = o_i - o_j \\ + \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} + $$ + + Rank loss layer takes batch inputs with size batch_size (batch_size >= 1). + + Args: + label (Variable): Indicats whether A ranked higher than B or not. + left (Variable): RankNet's output score for doc A. + right (Variable): RankNet's output score for doc B. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + list: The value of rank loss. 
+ + Raises: + ValueError: Any of label, left, and right is not a variable. + + Examples: + + .. code-block:: python + + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") + left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") + right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") + out = fluid.layers.rank_loss(label, left, right) + + + """ + helper = LayerHelper('rank_loss', **locals()) + + if not (isinstance(label, Variable)): + raise ValueError("The label should be a Variable") + + if not (isinstance(left, Variable)): + raise ValueError("The left should be a Variable") + + if not (isinstance(right, Variable)): + raise ValueError("The right should be a Variable") + + out = helper.create_tmp_variable("float32") + + helper.append_op( + type='rank_loss', + inputs={"Label": label, + "Left": left, + "Right": right}, + outputs={'Out': out}) + return out + + +def pad2d(input, + paddings=[0, 0, 0, 0], + mode='constant', + pad_value=0.0, + data_format="NCHW", + name=None): + """ + Pad 2-d images accordding to 'paddings' and 'mode'. + If mode is 'reflect', paddings[0] and paddings[1] must be no greater + than height-1. And the width dimension has the same condition. + + Example: + + Given that X is a channel of image from input: + + X = [[1, 2, 3], + [4, 5, 6]] + + Case 0: + + paddings = [0, 1, 2, 3], + mode = 'constant' + pad_value = 0 + + Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] + + Case 1: + + paddings = [0, 1, 2, 1], + mode = 'reflect' + + Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] + + Case 2: + + paddings = [0, 1, 2, 1], + mode = 'edge' + + Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] + + + Args: + input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format. + paddings (tuple|list): The padding size. If padding is a tuple, it must + contain four integers, (padding_top, padding_bottom, padding_left, padding_right). + Default: padding = [0, 0, 0, 0]. + mode (str): Three modes: constant(default), reflect, edge. Default: constant + pad_value (float32): The value to fill the padded areas in constant mode. Default: 0 + data_format (str): An optional string from: "NHWC", "NCHW". Specify the data format of + the input data. + Default: "NCHW" + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable padded accordding to paddings and mode. + + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + result = fluid.layers.pad2d(input=data, padding=[1,2,3,4], mode='reflect') + """ + + helper = LayerHelper('pad2d', **locals()) + dtype = helper.input_dtype(input_param_name='input') + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='pad2d', + inputs={'X': input}, + outputs={"Out": out}, + attrs={ + 'paddings': paddings, + 'mode': mode, + 'pad_value': pad_value, + 'data_frmat': data_format + }) + + return out + + +def prelu(x, mode, param_attr=None, name=None): + """ + Equation: + + y = \max(0, x) + alpha \min(0, x) + + Args: + x (Variable): The input tensor. + param_attr(ParamAttr|None): The parameter attribute for the learnable + weight (alpha). + mode (string): The mode for weight sharing + all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight + name(str|None): A name for this layer(optional). 
If set None, the layer + will be named automatically. + + Returns: + Variable: The output tensor with the same shape as input. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") + mode = 'channel' + output = fluid.layers.prelu(x,mode) + """ + helper = LayerHelper('prelu', **locals()) + if mode not in ['all', 'channel', 'element']: + raise ValueError('mode should be one of all, channel, element.') + alpha_shape = [1] + if mode == 'channel': + alpha_shape = [1, x.shape[1], 1, 1] + elif mode == 'element': + alpha_shape = x.shape + dtype = helper.input_dtype(input_param_name='x') + alpha = helper.create_parameter( + attr=param_attr, + shape=alpha_shape, + dtype='float32', + is_bias=False, + default_initializer=Constant(1.0)) + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="prelu", + inputs={"X": x, + 'Alpha': alpha}, + attrs={"mode": mode}, + outputs={"Out": out}) + return out + + +def flatten(x, axis=1, name=None): + """ + **Flatten layer** + Flattens the input tensor into a 2D matrix. + + Examples: + Case 1: + Given + X.shape = (3, 100, 100, 4) + and + axis = 2 + We get: + Out.shape = (3 * 100, 4 * 100) + + Case 2: + Given + X.shape = (3, 100, 100, 4) + and + axis = 0 + We get: + Out.shape = (1, 3 * 100 * 100 * 4) + + Args: + x (Variable): A tensor of rank >= axis. + axis (int): Indicate up to which input dimensions (exclusive) should + be flattened to the outer dimension of the output. + The value for axis must be in the range [0, R], where R + is the rank of the input tensor. When axis = 0, the shape + of the output tensor is (1, (d_0 X d_1 ... d_n), where the + shape of the input tensor is (d_0, d_1, ... d_n). + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: A 2D tensor with the contents of the input tensor, with input + dimensions up to axis flattened to the outer dimension of + the output and remaining input dimensions flattened into the + inner dimension of the output. + + Raises: + ValueError: If x is not a variable. + ValueError: If axis is not in range [0, rank(x)]. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[4, 4, 3], dtype="float32") + out = fluid.layers.flatten(x=x, axis=2) + """ + helper = LayerHelper('flatten', **locals()) + + if not (isinstance(x, Variable)): + raise ValueError("The input x should be a Variable") + + if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0: + raise ValueError("The axis should be a int, and in range [0, rank(x)]") + + out = helper.create_tmp_variable(x.dtype) + x_shape = helper.create_tmp_variable(x.dtype) + helper.append_op( + type='flatten2', + inputs={"X": x}, + outputs={'Out': out, + 'XShape': x_shape}, + attrs={"axis": axis}) + return out + + +def sequence_enumerate(input, win_size, pad_value=0, name=None): + """ + Generate a new sequence for the input index sequence, which enumerates all the + sub-sequences with length `win_size` of the input. + The enumerated sequence has the same 1st dimension with variable `input`, and + the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. + + Examples: + Case 1: + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + Attrs: + win_size = 2 + pad_value = 0 + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] + + Args: + input (Variable): The input variable which is a index sequence. 
+ win_size (int): The window size for enumerating all sub-sequences. + pad_value (int): The padding value, default 0. + + Returns: + Variable: The enumerate sequence variable which is a LoDTensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1) + out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) + """ + helper = LayerHelper('sequence_enumerate', **locals()) + out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True) + helper.append_op( + type='sequence_enumerate', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'win_size': win_size, + 'pad_value': pad_value}) + + +def sequence_mask(x, maxlen=None, dtype='int64', name=None): + """ + **SequenceMask Layer** + + This layer outputs a mask according to the input :code:`x` and + :code:`maxlen` with data type of :code:`dtype`. + + Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the + :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + + .. math:: + + y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) + + Args: + x (Variable): Input tensor of sequence_mask layer, + whose elements are integers less than :code:`maxlen`. + maxlen (int|None): Maximum length of the sequence. If :code:`maxlen` + is None, it would be replace with :math:`max(x)`. + dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The output sequence mask. + + """ + + helper = LayerHelper('sequence_mask', **locals()) + if name is None: + out = helper.create_tmp_variable(dtype=dtype) + else: + out = helper.create_tmp_variable(dtype=dtype, name=name) + + helper.append_op( + type='sequence_mask', + inputs={'X': [x]}, + outputs={'Y': out}, + attrs={ + 'max_len': maxlen if maxlen is not None else -1, + 'out_dtype': out.dtype + }) + return out + + +def stack(x, axis=0): + """ + **Stack Layer** + + This layer stacks all of the input :code:`x` along axis. + + Input :code:`x` can be a single variable, a :code:`list` of variables, + or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or + :code:`tuple`, the shapes of all these variables must be the same. + Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`, + the shape of the output variable would be + :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`. + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`. + If :code:`axis` is None, it would be replaced with 0. + + Args: + x (Variable|list(Variable)|tuple(Variable)): Input variables. + axis (int|None): The axis along which all inputs are stacked. + + Returns: + Variable: The stacked variable. + + """ + + helper = LayerHelper('stack', **locals()) + axis = 0 if axis is None else axis + + if not isinstance(x, list) and not isinstance(x, tuple): + x = [x] + + out = helper.create_tmp_variable(x[0].dtype) + helper.append_op( + type='stack', inputs={'X': x}, outputs={'Y': out}, + attrs={'axis': axis}) + + return out + + +def unstack(x, axis=0, num=None): + """ + **UnStack Layer** + + This layer unstacks input :code:`x` into several tensors along axis. + + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. + If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, + and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is + raised. + + Args: + x (Variable): Input variable. 
+ axis (int): The axis along which the input is unstacked. + num (int|None): The number of output variables. + + Returns: + list(Variable): The unstacked variables. + + """ + + helper = LayerHelper('unstack', **locals()) + if num is None: + if axis is None or x.shape[axis] <= 0: + raise ValueError('unknown unstack number') + else: + num = x.shape[axis] + + outs = [] + for _ in num: + outs.append(helper.create_tmp_variable(x.dtype)) + + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, + 'num': num}) + return outs diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9e97ec9a6f55680a2eb44ad712ac002df4fecda5..129252653dc139b7405626e6fd410704a4ad06d9 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from layer_function_generator import generate_layer_fn + +from __future__ import print_function +from .layer_function_generator import generate_layer_fn __activations__ = [ 'sigmoid', @@ -62,13 +64,11 @@ __all__ = [ 'logical_not', 'uniform_random_batch_size_like', 'gaussian_random', + 'sampling_id', 'gaussian_random_batch_size_like', - 'scatter', 'sum', 'slice', - 'polygon_box_transform', 'shape', - 'iou_similarity', 'maxout', ] + __activations__ diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index b6614ecf3bc16e73683f4991779769049c6800ed..04e71497aa762e390c4123c0bf3d7f111a772dd4 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc -from layer_function_generator import templatedoc +from .layer_function_generator import templatedoc import numpy __all__ = [ diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 49ec3088831dff415e042e1b0a632f63106eb07b..5688f04ab2382f5731e69c60225765a2094bba8c 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import numpy as np diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index b2b3186c1e8dd84e1527ff18744bd611f1f74c5f..a9de09f31f4ed04ba1aa003e85b25fc5a91557e4 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import core +from __future__ import print_function + +from . import core import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] @@ -24,7 +26,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): Create a lod tensor by doing the following: - 1. Check that the length-based level of detail (LoD) also known as + 1. 
Check that the length-based level of detail (LoD) also known as recursive_sequence_lengths of the input is valid. 2. Convert recursive_sequence_lengths to a offset-based LoD. @@ -33,7 +35,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): CPU or GPU device (based on input place). 4. Set the level of detail (LoD) using the offset-based LoD. - + Examples: Suppose we want LoDTensor to hold data for sequences of word, where each @@ -51,7 +53,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): Args: data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a list holding the data to be copied. - recursive_seq_lens(list): a list of lists indicating the length-based level of detail + recursive_seq_lens(list): a list of lists indicating the length-based level of detail info specified by the user. place(Place): CPU or GPU place indicating where the data in the new LoDTensor will be stored. @@ -62,10 +64,10 @@ def create_lod_tensor(data, recursive_seq_lens, place): if isinstance(data, core.LoDTensor): return create_lod_tensor(np.array(data), recursive_seq_lens, place) elif isinstance(data, list): - # When input data is a list, it only deal with the case where the base element - # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated - # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number - # of words or other indexes in the sequence. + # When input data is a list, it only deal with the case where the base element + # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated + # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number + # of words or other indexes in the sequence. new_recursive_seq_lens = [] for seq in data: new_recursive_seq_lens.append(len(seq)) @@ -109,12 +111,12 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, Suppose we want LoDTensor to hold data for sequences of word, where each word is represented by an integer. If we want to create a LoDTensor to represent two sentences, one of 2 words, and one of 3 words. Then - 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. - Then the overall shape of the LoDTensor would be [5, 1], holding 5 words + 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. + Then the overall shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. Args: - recursive_seq_lens(list): a list of lists indicating the length-based + recursive_seq_lens(list): a list of lists indicating the length-based level of detail info specified by the user. base_shape(list): the shape of the basic element to be held by the LoDTensor. @@ -124,11 +126,11 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, high(int): the upper bound of the random integers. Returns: - A fluid LoDTensor object with tensor data and recursive_seq_lens info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. 
""" assert isinstance(base_shape, list), "base_shape should be a list" # append the total number of basic elements to the front of its shape overall_shape = [sum(recursive_seq_lens[-1])] + base_shape - # the range of integer data elements is [low, high] + # the range of integer data elements is [low, high] data = np.random.random_integers(low, high, overall_shape).astype("int64") return create_lod_tensor(data, recursive_seq_lens, place) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 17bb0826a6ea86c98a069263dfab84b99e1177ad..0c2800dcf35ed156b71625babea2724f520575e5 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -14,11 +14,15 @@ """ Fluid Metrics -The metrics are accomplished via Python natively. +The metrics are accomplished via Python natively. """ + +from __future__ import print_function + import numpy as np import copy import warnings +import six __all__ = [ 'MetricBase', @@ -79,10 +83,10 @@ class MetricBase(object): """ states = { attr: value - for attr, value in self.__dict__.iteritems() + for attr, value in six.iteritems(self.__dict__) if not attr.startswith("_") } - for attr, value in states.iteritems(): + for attr, value in six.iteritems(states): if isinstance(value, int): setattr(self, attr, 0) elif isinstance(value, float): @@ -105,7 +109,7 @@ class MetricBase(object): """ states = { attr: value - for attr, value in self.__dict__.iteritems() + for attr, value in six.iteritems(self.__dict__) if not attr.startswith("_") } config = {} @@ -141,10 +145,10 @@ class CompositeMetric(MetricBase): """ Composite multiple metrics in one instance. for example, merge F1, accuracy, recall into one Metric. - + Examples: .. code-block:: python - + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") pred = fluid.layers.fc(input=data, size=1000, act="tanh") @@ -554,8 +558,6 @@ class Auc(MetricBase): name: metric name curve: Specifies the name of the curve to be computed, 'ROC' [default] or 'PR' for the Precision-Recall-curve. - num_thresholds: The number of thresholds to use when discretizing the roc - curve. "NOTE: only implement the ROC curve type via Python now." 
@@ -570,15 +572,14 @@ class Auc(MetricBase): numpy_auc = metric.eval() """ - def __init__(self, name, curve='ROC', num_thresholds=200): + def __init__(self, name, curve='ROC', num_thresholds=4095): super(Auc, self).__init__(name=name) self._curve = curve self._num_thresholds = num_thresholds - self._epsilon = 1e-6 - self.tp_list = np.zeros((num_thresholds, )) - self.fn_list = np.zeros((num_thresholds, )) - self.tn_list = np.zeros((num_thresholds, )) - self.fp_list = np.zeros((num_thresholds, )) + + _num_pred_buckets = num_thresholds + 1 + self._stat_pos = [0] * _num_pred_buckets + self._stat_neg = [0] * _num_pred_buckets def update(self, preds, labels): if not _is_numpy_(labels): @@ -586,41 +587,32 @@ class Auc(MetricBase): if not _is_numpy_(preds): raise ValueError("The 'predictions' must be a numpy ndarray.") - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) - for i in range(self._num_thresholds - 2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - - # caculate TP, FN, TN, FP count - for idx_thresh, thresh in enumerate(thresholds): - tp, fn, tn, fp = 0, 0, 0, 0 - for i, lbl in enumerate(labels): - if lbl: - if preds[i, 1] >= thresh: - tp += 1 - else: - fn += 1 - else: - if preds[i, 1] >= thresh: - fp += 1 - else: - tn += 1 - self.tp_list[idx_thresh] += tp - self.fn_list[idx_thresh] += fn - self.tn_list[idx_thresh] += tn - self.fp_list[idx_thresh] += fp + for i, lbl in enumerate(labels): + value = preds[i, 1] + bin_idx = int(value * self._num_thresholds) + assert bin_idx <= self._num_thresholds + if lbl: + self._stat_pos[bin_idx] += 1.0 + else: + self._stat_neg[bin_idx] += 1.0 + + @staticmethod + def trapezoid_area(x1, x2, y1, y2): + return abs(x1 - x2) * (y1 + y2) / 2.0 def eval(self): - epsilon = self._epsilon - num_thresholds = self._num_thresholds - tpr = (self.tp_list.astype("float32") + epsilon) / ( - self.tp_list + self.fn_list + epsilon) - fpr = self.fp_list.astype("float32") / ( - self.fp_list + self.tn_list + epsilon) - rec = (self.tp_list.astype("float32") + epsilon) / ( - self.tp_list + self.fp_list + epsilon) - - x = fpr[:num_thresholds - 1] - fpr[1:] - y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 - auc_value = np.sum(x * y) - return auc_value + tot_pos = 0.0 + tot_neg = 0.0 + auc = 0.0 + + idx = self._num_thresholds + while idx >= 0: + tot_pos_prev = tot_pos + tot_neg_prev = tot_neg + tot_pos += self._stat_pos[idx] + tot_neg += self._stat_neg[idx] + auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos, + tot_pos_prev) + idx -= 1 + + return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index 73946a0721dc4a6d03074a4708cf574951412e66..0b61c23d07e95acf7b4564753f748e7fb497e73e 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import argparse import json import logging @@ -24,7 +26,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) try: - from graphviz import Digraph + from .graphviz import Digraph except ImportError: logger.info( 'Cannot import graphviz, which is required for drawing a network. 
This ' @@ -77,7 +79,7 @@ def parse_graph(program, graph, var_dict, **kwargs): # fill the known variables for block in program.blocks: for var in block.vars: - if not var_dict.has_key(var): + if var not in var_dict: var_dict[var] = "Feed" temp_id = 0 @@ -93,17 +95,17 @@ def parse_graph(program, graph, var_dict, **kwargs): var_dict[arg] = op.type for e in op.inputs: for arg in e.arguments: - if var_dict.has_key(arg): + if arg in var_dict: graph.edge(**draw_edge(var_dict, op, e, arg)) break # only plot the first block def draw_graph(startup_program, main_program, **kwargs): - if kwargs.has_key("graph_attr"): + if "graph_attr" in kwargs: GRAPH_STYLE.update(kwargs[graph_attr]) - if kwargs.has_key("node_attr"): + if "node_attr" in kwargs: OP_STYLE.update(kwargs[node_attr]) - if kwargs.has_key("edge_attr"): + if "edge_attr" in kwargs: VAR_STYLE.update(kwargs[edge_attr]) graph_id = unique_id() diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa..051fe84364639ca6028326c0cb02b204a02531af 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -11,7 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import layers + +from __future__ import print_function +import six +from . import layers __all__ = [ "simple_img_conv_pool", @@ -210,7 +213,7 @@ def img_conv_group(input, conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) - for i in xrange(len(conv_num_filter)): + for i in six.moves.range(len(conv_num_filter)): local_conv_act = conv_act if conv_with_batchnorm[i]: local_conv_act = None @@ -488,10 +491,11 @@ def scaled_dot_product_attention(queries, trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) return layers.reshape( x=trans_x, - shape=map(int, [ - trans_x.shape[0], trans_x.shape[1], - trans_x.shape[2] * trans_x.shape[3] - ])) + shape=list( + map(int, [ + trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] * + trans_x.shape[3] + ]))) q, k, v = __compute_qkv(queries, keys, values, num_heads) diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py index 0b76e94157e378b40baff641c466968e239d8a83..667db10d3ebdd24ddd9efbe2310ebb331e268ee2 100644 --- a/python/paddle/fluid/op.py +++ b/python/paddle/fluid/op.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
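Side note on the Python 3 compatibility substitutions used in the `net_drawer.py` and `nets.py` hunks above; the snippet below only restates those idioms with illustrative values.

```python
import six

op_outputs = {'conv1': 'Feed'}

# dict.has_key() does not exist on Python 3; membership tests replace it.
assert 'conv1' in op_outputs        # instead of op_outputs.has_key('conv1')

# xrange() is Python 2 only; six.moves.range works on both interpreters.
for i in six.moves.range(3):
    pass

# map() returns an iterator on Python 3, so the result is materialised
# explicitly wherever a concrete list is required (e.g. a reshape target).
shape = list(map(int, ['3', '100', '400']))
```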
+from __future__ import print_function + +import numpy as np +import six + import paddle.fluid.core as core import paddle.fluid.proto.framework_pb2 as framework_pb2 @@ -24,13 +29,13 @@ def get_all_op_protos(): protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: - op_proto = framework_pb2.OpProto.FromString(str(pbstr)) + op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr)) ret_values.append(op_proto) return ret_values def is_str(s): - return isinstance(s, str) or isinstance(s, unicode) + return isinstance(s, six.string_types) class OpDescCreationMethod(object): @@ -97,6 +102,8 @@ class OpDescCreationMethod(object): new_attr = op_desc.attrs.add() new_attr.name = attr.name new_attr.type = attr.type + if isinstance(user_defined_attr, np.ndarray): + user_defined_attr = user_defined_attr.tolist() if attr.type == framework_pb2.INT: new_attr.i = user_defined_attr elif attr.type == framework_pb2.FLOAT: @@ -189,7 +196,7 @@ class OperatorFactory(object): return self.get_op_info(t).method(**kwargs) def types(self): - return self.op_methods.keys() + return list(self.op_methods.keys()) def get_op_info(self, t): if t not in self.op_methods: @@ -197,13 +204,13 @@ class OperatorFactory(object): return self.op_methods.get(t) def get_op_input_names(self, type): - return map(lambda x: x[0], self.get_op_info(type).inputs) + return [x[0] for x in self.get_op_info(type).inputs] def get_op_inputs(self, type): return self.get_op_info(type).inputs def get_op_output_names(self, type): - return map(lambda x: x[0], self.get_op_info(type).outputs) + return [x[0] for x in self.get_op_info(type).outputs] def get_op_outputs(self, type): return self.get_op_info(type).outputs diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33..215f0cf2fc5ab4fbd06719ac4790a01dd00080eb 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -11,25 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import re from collections import defaultdict -from paddle.fluid.framework import Program, Variable -import framework -import layers -from backward import append_backward -from framework import program_guard -import unique_name -from initializer import Constant -from layer_helper import LayerHelper -from regularizer import append_regularization_ops -from clip import append_gradient_clip_ops, error_clip_callback +from paddle.fluid.framework import Program, Variable, name_scope +from . import framework +from . import layers +from .backward import append_backward +from .framework import program_guard +from . 
import unique_name +from .initializer import Constant +from .layer_helper import LayerHelper +from .regularizer import append_regularization_ops +from .clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', - 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer' + 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer' ] @@ -44,10 +46,12 @@ class Optimizer(object): def __init__(self, learning_rate, regularization=None, - LARS_weight_decay=0.0): + LARS_weight_decay=0.0, + name=None): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") + self._name = name self.regularization = regularization self._learning_rate = learning_rate # the learning rate type should be inferenced from loss @@ -67,7 +71,7 @@ class Optimizer(object): self._LARS_weight_decay = LARS_weight_decay def _create_global_learning_rate(self): - lr = self.global_learning_rate() + lr = self._global_learning_rate() if isinstance(lr, framework.Variable): return @@ -86,7 +90,7 @@ class Optimizer(object): dtype='float32' if self._dtype == None else self._dtype, persistable=True) - def global_learning_rate(self, program=None): + def _global_learning_rate(self, program=None): """ get global decayed learning rate :return: @@ -110,9 +114,9 @@ class Optimizer(object): return param_lr else: if param_lr == 1.0: - return self.global_learning_rate() + return self._global_learning_rate() else: - return self.global_learning_rate() * param_lr + return self._global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -123,7 +127,7 @@ class Optimizer(object): """ pass - def _finish_update(self, block): + def _finish_update(self, block, parameters_and_grads): """Finish any custom updates needed before completing an optimization step @@ -132,7 +136,7 @@ class Optimizer(object): parameters: list of parameter variables for the optimizer Returns: - list of finish ops or None + None """ pass @@ -151,6 +155,8 @@ class Optimizer(object): dtype: data type of the accumulator variable fill_value: value to initialize the accumulator variable """ + if self._name is not None: + name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): raise Exception("Accumulator {} already exists for parameter {}". @@ -179,16 +185,18 @@ class Optimizer(object): Returns: accumulator variable for the parameter """ + if self._name is not None: + name = self._name + "_" + name if (name not in self._accumulators or param.name not in self._accumulators[name]): raise Exception("Accumulator {} does not exist for parameter {}". format(name, param.name)) return self._accumulators[name][param.name] - def create_optimization_pass(self, - parameters_and_grads, - loss, - startup_program=None): + def _create_optimization_pass(self, + parameters_and_grads, + loss, + startup_program=None): """Add optimization operators to update gradients to variables. 
Args: @@ -221,25 +229,26 @@ class Optimizer(object): self._create_global_learning_rate() if self._LARS_weight_decay > 0.0: layers.append_LARS(parameters_and_grads, - self.global_learning_rate(), + self._global_learning_rate(), self._LARS_weight_decay) optimize_ops = [] for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue with param_and_grad[0].block.program.optimized_guard( - param_and_grad[0]): - if param_and_grad[0].trainable is True and param_and_grad[ - 1] is not None: + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: optimize_op = self._append_optimize_op(loss.block, param_and_grad) optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block) + self._finish_update(loss.block, parameters_and_grads) end = len(global_block.ops) - return global_block.slice_ops(start, end) + return global_block._slice_ops(start, end) def minimize(self, loss, @@ -262,8 +271,8 @@ class Optimizer(object): params_grads = append_regularization_ops(params_grads, self.regularization) - optimize_ops = self.create_optimization_pass(params_grads, loss, - startup_program) + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) return optimize_ops, params_grads @@ -323,7 +332,7 @@ class MomentumOptimizer(Optimizer): & if (use\_nesterov): - &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate + &\quad param = param - (gradient + mu * velocity) * learning\_rate & else: @@ -486,6 +495,8 @@ class AdamOptimizer(Optimizer): """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" def __init__(self, learning_rate=0.001, @@ -507,32 +518,22 @@ class AdamOptimizer(Optimizer): def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) - main_block = block.program.global_block() - # Create beta1 and beta2 power tensors - beta_shape = [1] - self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta1_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=Constant(self._beta1)) - - self._beta2_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta2_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - - self.helper.set_variable_initializer( - self._beta2_pow_acc, initializer=Constant(self._beta2)) - # Create accumulator tensors for first and second moments for p in parameters: self._add_accumulator(self._moment1_acc_str, p) self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta1, + shape=[1]) + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta2, + shape=[1]) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -541,6 +542,11 @@ class AdamOptimizer(Optimizer): param_and_grad[0]) moment2 = self._get_accumulator(self._moment2_acc_str, param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + 
param_and_grad[0]) + # create the adam optimize op adam_op = block.append_op( type=self.type, @@ -550,8 +556,8 @@ class AdamOptimizer(Optimizer): "LearningRate": self._create_param_lr(param_and_grad), "Moment1": moment1, "Moment2": moment2, - "Beta1Pow": self._beta1_pow_acc, - "Beta2Pow": self._beta2_pow_acc + "Beta1Pow": beta1_pow_acc, + "Beta2Pow": beta2_pow_acc }, outputs={ "ParamOut": param_and_grad[0], @@ -566,24 +572,30 @@ class AdamOptimizer(Optimizer): return adam_op - def _finish_update(self, block): + def _finish_update(self, block, param_and_grads): """Update Beta1 and Beta2 Power accumulators """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - scale_beta1 = main_block.append_op( - type="scale", - inputs={"X": self._beta1_pow_acc}, - outputs={"Out": self._beta1_pow_acc}, - attrs={"scale": self._beta1}) - - scale_beta2 = main_block.append_op( - type="scale", - inputs={"X": self._beta2_pow_acc}, - outputs={"Out": self._beta2_pow_acc}, - attrs={"scale": self._beta2}) - - return [scale_beta1, scale_beta2] + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): @@ -626,6 +638,7 @@ class AdamaxOptimizer(Optimizer): """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" def __init__(self, learning_rate=0.001, @@ -645,21 +658,16 @@ class AdamaxOptimizer(Optimizer): self._epsilon = epsilon def _create_accumulators(self, block, parameters): - # Create beta1 power accumulator tensor - beta_shape = [1] - self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta1_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=Constant(self._beta1)) - # Create accumulator tensors for first moment and infinity norm for p in parameters: self._add_accumulator(self._moment_acc_str, p) self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta1, + shape=[1]) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -667,6 +675,8 @@ class AdamaxOptimizer(Optimizer): moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) inf_norm = self._get_accumulator(self._inf_norm_acc_str, param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) # create the adamax optimize op adamax_op = block.append_op( type=self.type, @@ -676,7 +686,7 @@ class AdamaxOptimizer(Optimizer): "LearningRate": self._create_param_lr(param_and_grad), "Moment": moment, "InfNorm": inf_norm, - "Beta1Pow": self._beta1_pow_acc + "Beta1Pow": beta1_pow_acc }, outputs={ "ParamOut": param_and_grad[0], @@ -691,18 +701,22 @@ class AdamaxOptimizer(Optimizer): return adamax_op - def _finish_update(self, block): + def _finish_update(self, block, parameters_and_grads): """Update 
Beta1 Power accumulator """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - scale_beta1 = main_block.append_op( - type="scale", - inputs={"X": self._beta1_pow_acc}, - outputs={"Out": self._beta1_pow_acc}, - attrs={"scale": self._beta1}) - - return [scale_beta1] + for param, grad in parameters_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) class DecayedAdagradOptimizer(Optimizer): @@ -883,7 +897,20 @@ class RMSPropOptimizer(Optimizer): r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + if centered is True: + + .. math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + \\epsilon}} \\nabla Q_{i}(w) w & = w - v(w, t) @@ -901,6 +928,10 @@ class RMSPropOptimizer(Optimizer): avoid division by zero, set 1e-6 by default. momentum(float): :math:`\\beta` in equation is the momentum term, set 0.0 by default. + centered(bool): If True, gradients are normalized by the estimated variance of + the gradient; if False, by the uncentered second moment. Setting this to + True may help with training, but is slightly more expensive in terms of + computation and memory. Defaults to False. Raises: ValueError: If learning_rate, rho, epsilon, momentum are None. 
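The `RMSPropOptimizer` hunks above add a `centered` variant that keeps an extra mean-gradient (`MeanGrad`) accumulator per parameter. A minimal sketch of enabling it, assuming a simple regression network; the shapes and hyper-parameters are placeholders rather than values from this patch.

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_pred, label=y))

# centered=True normalises the update by an estimate of the gradient
# variance (r - g^2) instead of the uncentered second moment, at the cost
# of one extra accumulator per parameter.
optimizer = fluid.optimizer.RMSPropOptimizer(
    learning_rate=0.01, rho=0.95, epsilon=1e-6, momentum=0.9, centered=True)
optimizer.minimize(loss)
```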
@@ -914,12 +945,14 @@ class RMSPropOptimizer(Optimizer): _momentum_acc_str = "momentum" _mean_square_acc_str = "mean_square" + _mean_grad_acc_str = "mean_grad" def __init__(self, learning_rate, rho=0.95, epsilon=1.0e-6, momentum=0.0, + centered=False, **kwargs): super(RMSPropOptimizer, self).__init__( learning_rate=learning_rate, **kwargs) @@ -936,6 +969,7 @@ class RMSPropOptimizer(Optimizer): self._rho = rho self._epsilon = epsilon self._momentum = momentum + self._centered = centered def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): @@ -944,6 +978,7 @@ class RMSPropOptimizer(Optimizer): for p in parameters: self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p) + self._add_accumulator(self._mean_grad_acc_str, p) def _append_optimize_op(self, block, param_and_grad): if not isinstance(block, framework.Block): @@ -953,6 +988,8 @@ class RMSPropOptimizer(Optimizer): param_and_grad[0]) mean_square_acc = self._get_accumulator(self._mean_square_acc_str, param_and_grad[0]) + mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, + param_and_grad[0]) rmsprop_op = block.append_op( type=self.type, inputs={ @@ -960,17 +997,20 @@ class RMSPropOptimizer(Optimizer): "Grad": param_and_grad[1], "Moment": momentum_acc, "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, "LearningRate": self._create_param_lr(param_and_grad), }, outputs={ "ParamOut": param_and_grad[0], "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc }, attrs={ "epsilon": self._epsilon, "decay": self._rho, - "momentum": self._momentum + "momentum": self._momentum, + "centered": self._centered }) return rmsprop_op @@ -1156,7 +1196,10 @@ class ModelAverage(Optimizer): self.params_grads.append((param, grad)) for param, grad in self.params_grads: - self._append_average_accumulate_op(param) + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): + self._append_average_accumulate_op(param) self.apply_program = Program() block = self.apply_program.global_block() @@ -1171,16 +1214,16 @@ class ModelAverage(Optimizer): self._add_average_restore_op(block, param_grad) def _add_average_apply_op(self, block, param_grad): - param = block.clone_variable(param_grad[0]) - grad = block.clone_variable(param_grad[1]) - sum_1 = block.clone_variable(self._get_accumulator('sum_1', param)) - sum_2 = block.clone_variable(self._get_accumulator('sum_2', param)) - sum_3 = block.clone_variable(self._get_accumulator('sum_3', param)) - num_accumulates = block.clone_variable( + param = block._clone_variable(param_grad[0]) + grad = block._clone_variable(param_grad[1]) + sum_1 = block._clone_variable(self._get_accumulator('sum_1', param)) + sum_2 = block._clone_variable(self._get_accumulator('sum_2', param)) + sum_3 = block._clone_variable(self._get_accumulator('sum_3', param)) + num_accumulates = block._clone_variable( self._get_accumulator('num_accumulates', param)) - old_num_accumulates = block.clone_variable( + old_num_accumulates = block._clone_variable( self._get_accumulator('old_num_accumulates', param)) - num_updates = block.clone_variable( + num_updates = block._clone_variable( self._get_accumulator('num_updates', param)) # backup param value to grad layers.assign(input=param, output=grad) @@ -1194,8 +1237,8 @@ class ModelAverage(Optimizer): layers.elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param_grad): - param = 
block.clone_variable(param_grad[0]) - grad = block.clone_variable(param_grad[1]) + param = block._clone_variable(param_grad[0]) + grad = block._clone_variable(param_grad[1]) layers.assign(input=grad, output=param) def _append_average_accumulate_op(self, param): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 6baf648198585022f992709c519038688af293e1..44af29d3390e35129d0ee65b31eacad6b28a9d60 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import core +from __future__ import print_function import multiprocessing -import framework -import executor +from . import core +from . import framework +from . import executor +from .. import compat as cpt import warnings import sys +import six import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] @@ -40,8 +43,9 @@ class ParallelExecutor(object): num_trainers(int): If greater than 1, NCCL will be initialized with multiple rank of nodes, each node should have same number of GPUs. Distributed training will be enabled then. Default 1. - trainer_id(int: Must use together with num_trainers. trainer_id is the + trainer_id(int): Must use together with num_trainers. trainer_id is the "rank" of current node starts from 0. Default 0. + scope(Scope): scope to run with, default use fluid.global_scope(). Returns: ParallelExecutor: The initialized ParallelExecutor object. @@ -70,6 +74,7 @@ class ParallelExecutor(object): build_strategy=None, num_trainers=1, trainer_id=0, + scope=None, **kwargs): if len(kwargs) != 0: err_msg = "" @@ -94,7 +99,7 @@ class ParallelExecutor(object): self._places = [] self._act_places = [] if use_cuda: - for i in xrange(core.get_cuda_device_count()): + for i in six.moves.range(core.get_cuda_device_count()): p = core.Place() self._act_places.append(core.CUDAPlace(i)) p.set_place(self._act_places[-1]) @@ -102,7 +107,7 @@ class ParallelExecutor(object): else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in xrange(cpu_num): + for i in six.moves.range(cpu_num): p = core.Place() self._act_places.append(core.CPUPlace()) p.set_place(self._act_places[-1]) @@ -121,19 +126,22 @@ class ParallelExecutor(object): else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num + exec_strategy.num_threads = cpu_num * 2 + + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 if build_strategy is None: build_strategy = BuildStrategy() main = main_program main = main if main else framework.default_main_program() - scope = executor.global_scope() - # FIXME(Yancey1989): it's a temporary approach to determinate the distribute - # train program, call self.bcast_param() at the end of each mini-batch. 
- self.is_dist = True if "recv" in [ - op.type for op in main.global_block().ops - ] else False + if scope == None: + scope = executor.global_scope() if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): @@ -143,20 +151,22 @@ class ParallelExecutor(object): ) if share_vars_from else [] self.persistable_vars = [ - v.name - for v in filter( - lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW, - main.list_vars()) + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] ] self.executor = core.ParallelExecutor( self._places, set([ - p.name for p in main.global_block().iter_parameters() + cpt.to_text(p.name) + for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - set(self.persistable_vars), main.desc, loss_name - if loss_name else '', scope, local_scopes, exec_strategy, + set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + cpt.to_text(loss_name) + if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy, num_trainers, trainer_id) self.scope = scope @@ -227,7 +237,9 @@ class ParallelExecutor(object): """ if feed is None and feed_dict is not None: feed = feed_dict - print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`" + print( + "`feed_dict` is deprecated. Please use `feed=`", + file=sys.stderr) if isinstance(feed, dict): feed_tensor_dict = dict() @@ -269,21 +281,11 @@ class ParallelExecutor(object): self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - if self.is_dist: - self.bcast_params() - if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def bcast_params(self): - """ - Broadcast the parameters to other devices. It is used during - distributed training. - """ - self.executor.bcast_params(set(self.persistable_vars)) - @property def device_count(self): return len(self._act_places) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 0a42b9fca8dba7a11b414990be6c04c93158864f..f0be794327f51cbbc4202b8b7b401b712b6d66a3 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from initializer import Initializer, Xavier, Constant -from regularizer import WeightDecayRegularizer +from __future__ import print_function + +import six + +from .initializer import Initializer, Xavier, Constant +from .regularizer import WeightDecayRegularizer __all__ = [ 'ParamAttr', @@ -67,7 +71,7 @@ class ParamAttr(object): self.gradient_clip = gradient_clip self.model_average = do_model_average - def set_default_initializer(self, initializer): + def _set_default_initializer(self, initializer): """ Set the default initializer, the initializer should be Constant, Uniform, Normal, Xavier, MSRA. @@ -88,7 +92,7 @@ class ParamAttr(object): self.initializer = initializer - def set_default_param_initializer(self): + def _set_default_param_initializer(self): """ Set the default initializer for the parameter with Xavier. @@ -98,9 +102,9 @@ class ParamAttr(object): Returns: None. """ - self.set_default_initializer(Xavier()) + self._set_default_initializer(Xavier()) - def set_default_bias_initializer(self): + def _set_default_bias_initializer(self): """ Set the default initializer for the bias with Constant(0.0). @@ -110,10 +114,10 @@ class ParamAttr(object): Returns: None. 
""" - self.set_default_initializer(Constant(0.0)) + self._set_default_initializer(Constant(0.0)) @staticmethod - def to_attr(arg): + def _to_attr(arg): """ Create ParamAttr[s]. @@ -131,21 +135,21 @@ class ParamAttr(object): if arg is None: return ParamAttr() elif isinstance(arg, list) or isinstance(arg, tuple): - return [ParamAttr.to_attr(a) for a in arg] + return [ParamAttr._to_attr(a) for a in arg] elif isinstance(arg, ParamAttr): return arg - elif isinstance(arg, str) or isinstance(arg, unicode): + elif isinstance(arg, six.string_types): return ParamAttr(name=arg) elif isinstance(arg, Initializer): return ParamAttr(initializer=arg) elif isinstance(arg, WeightDecayRegularizer): return ParamAttr(regularizer=arg) elif isinstance(arg, bool): - return ParamAttr.to_attr(None) if arg else False + return ParamAttr._to_attr(None) if arg else False else: raise TypeError("{0} cast to ParamAttr".format(type(arg))) - def to_kwargs(self, with_initializer=False): + def _to_kwargs(self, with_initializer=False): """ Returns the attributes of this parameter. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 6a321ae024dcb50452bc4d96d7e7e70f590a42c6..e05885f5f5bfc169828c1c6e723dffff098c3c2e 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import core +from __future__ import print_function + +from . import core from contextlib import contextmanager import os +import six __all__ = [ 'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler', @@ -88,7 +91,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): config = NVPROF_CONFIG if config is None else config config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: - fp.writelines(["%s\n" % item for item in config]) + fp.writelines([six.b("%s\n" % item) for item in config]) core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() @@ -218,20 +221,20 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. Different from cuda_profiler, this profiler can be used to profile both CPU - and GPU program. By defalut, it records the CPU and GPU operator kernels, + and GPU program. By default, it records the CPU and GPU operator kernels, if you want to profile other program, you can refer the profiling tutorial to add more records in C++ code. If the state == 'All', a profile proto file will be written to `profile_path`. This file records timeline information during the execution. - Then users can visualize this file to see the timeline, please refer + Then users can visualize this file to see the timeline, please refer https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md Args: state (string) : The profiling state, which should be 'CPU' or 'GPU', telling the profiler to use CPU timer or GPU timer for profiling. Although users may have already specified the execution place - (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler + (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler would not inherit this place. sorted_key (string) : If None, the profiling results will be printed in the order of first end time of events. 
Otherwise, the profiling diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index bd57772713057f12b876942de58ee43527e94834..a69c0c29d4675d3e6b9b2a2d766b8be9935092cf 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import os -import core import contextlib +from . import core __all__ = [ 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' ] diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index dac474d5ee76590a75311d6bf2c4cb2fe85b6c40..da38626111a6767e1a76a35d6d1375ccc1283de4 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import framework +from __future__ import print_function + +from . import framework from . import core -__all__ = [ - 'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer', - 'L2DecayRegularizer' -] +__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer'] def append_regularization_ops(parameters_and_grads, regularization=None): @@ -44,12 +43,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None): """ params_and_grads = [] for param, grad in parameters_and_grads: - with param.block.program.optimized_guard(param): - # If no gradient then we don't need to do anything - if grad is None: - params_and_grads.append((param, grad)) - continue - + # If no gradient then we don't need to do anything + if grad is None: + params_and_grads.append((param, grad)) + continue + with param.block.program.optimized_guard([param, grad]): regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block @@ -146,14 +144,20 @@ class L2DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) param = decay @@ -220,14 +224,20 @@ class L1DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index ad28c9eff560507e5b326451159be3949353f58f..e1368a3392a9cab3e82eff0a73eb225a52aa03bf 100644 --- 
a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid as fluid import contextlib @@ -45,14 +47,14 @@ def train_program(): loss = fluid.layers.square_error_cost(input=y_predict, label=y) avg_loss = fluid.layers.mean(loss) - return avg_loss + return [avg_loss, y_predict] def optimizer_func(): return fluid.optimizer.SGD(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, params_dirname, inference_model_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( @@ -63,7 +65,7 @@ def train(use_cuda, train_program, params_dirname): if event.step == 10: test_metrics = trainer.test( reader=test_reader, feed_order=['x', 'y']) - print test_metrics + print(test_metrics) ''' ... ['25.768919467926025'] @@ -72,6 +74,8 @@ def train(use_cuda, train_program, params_dirname): ''' if params_dirname is not None: trainer.save_params(params_dirname) + trainer.save_inference_model(inference_model_dirname, + ['x'], [1]) trainer.stop() trainer.train( @@ -97,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results[0]) +def infer_by_saved_model(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension should be 2-D and the second dim is 13 + # The input data should be >= 0 + batch_size = 10 + + test_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=batch_size) + + test_data = next(test_reader()) + test_feat = numpy.array( + [data[0] for data in test_data]).astype("float32") + test_label = numpy.array( + [data[1] for data in test_data]).astype("float32") + + assert feed_target_names[0] == 'x' + results = exe.run(inference_program, + feed={feed_target_names[0]: numpy.array(test_feat)}, + fetch_list=fetch_targets) + print("infer shape: ", results[0].shape) + print("infer results: ", results[0]) + print("ground truth: ", test_label) + + def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model - params_dirname = "fit_a_line.inference.model" + params_dirname = "fit_a_line.model" + inference_model_dirname = "fit_a_line.inference_model" - train(use_cuda, train_program, params_dirname) + train(use_cuda, train_program, params_dirname, inference_model_dirname) infer(use_cuda, inference_program, params_dirname) + infer_by_saved_model(use_cuda, inference_model_dirname) class TestFitALine(unittest.TestCase): diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py index 7fed6d914f75b690e34411aa154359c93b6ca989..48c0f3d3611547308b5d4460748d3aab765f5805 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py @@ -28,11 +28,14 @@ images per class. 
""" -import cPickle +from __future__ import print_function + import itertools import numpy -import paddle.v2.dataset.common +import paddle.dataset.common import tarfile +import six +from six.moves import cPickle as pickle __all__ = ['train10'] @@ -43,20 +46,25 @@ CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' def reader_creator(filename, sub_name, batch_size=None): def read_batch(batch): - data = batch['data'] - labels = batch.get('labels', batch.get('fine_labels', None)) + data = batch[six.b('data')] + labels = batch.get( + six.b('labels'), batch.get(six.b('fine_labels'), None)) assert labels is not None - for sample, label in itertools.izip(data, labels): + for sample, label in six.moves.zip(data, labels): yield (sample / 255.0).astype(numpy.float32), int(label) def reader(): with tarfile.open(filename, mode='r') as f: - names = (each_item.name for each_item in f - if sub_name in each_item.name) + names = [ + each_item.name for each_item in f if sub_name in each_item.name + ] batch_count = 0 for name in names: - batch = cPickle.load(f.extractfile(name)) + if six.PY2: + batch = pickle.load(f.extractfile(name)) + else: + batch = pickle.load(f.extractfile(name), encoding='bytes') for item in read_batch(batch): if isinstance(batch_size, int) and batch_count > batch_size: break @@ -77,6 +85,6 @@ def train10(batch_size=None): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch', batch_size=batch_size) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 8e222d26907e8fe697b596a67e62cc9df84afe0e..de276755bb1eb2746cc780575a40357255223809 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -16,7 +16,10 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import paddle.fluid.core as core import numpy +import six +import os import cifar10_small_test_set @@ -57,7 +60,7 @@ def resnet_cifar10(input, depth=32): return tmp assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 + n = (depth - 2) // 6 conv1 = conv_bn_layer( input=input, ch_out=16, filter_size=3, stride=1, padding=1) res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) @@ -89,7 +92,7 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, parallel, params_dirname): BATCH_SIZE = 128 EPOCH_NUM = 1 @@ -116,7 +119,10 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, optimizer_func=optimizer_func, place=place) + train_func=train_program, + optimizer_func=optimizer_func, + place=place, + parallel=parallel) trainer.train( reader=train_reader, @@ -125,10 +131,13 @@ def train(use_cuda, train_program, params_dirname): feed_order=['pixel', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, 
param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range @@ -139,22 +148,34 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results) -def main(use_cuda): +def main(use_cuda, parallel): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "image_classification_resnet.inference.model" + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. + if parallel and use_cuda: + return + + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) if __name__ == '__main__': for use_cuda in (False, True): - main(use_cuda=use_cuda) + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index dbc7bc06c93157f271c79e85b6925468e861e57f..dd547f3448ae55c07b6c09f9de4ac08d8ec5ee88 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -16,7 +16,10 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import paddle.fluid.core as core import numpy +import six +import os import cifar10_small_test_set @@ -68,7 +71,7 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, parallel, params_dirname): BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( @@ -93,7 +96,10 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_func) + train_func=train_program, + place=place, + optimizer_func=optimizer_func, + parallel=parallel) trainer.train( reader=train_reader, @@ -102,10 +108,13 @@ def train(use_cuda, train_program, params_dirname): feed_order=['pixel', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) # The input's dimension of conv should be 4-D or 5-D. 
# Use normilized image pixels as input data, which should be in the range @@ -116,22 +125,31 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results) -def main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return +def main(use_cuda, parallel): save_path = "image_classification_vgg.inference.model" + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. + if parallel and use_cuda: + return + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) if __name__ == '__main__': for use_cuda in (False, True): - main(use_cuda=use_cuda) + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index 67aa21e8c5699f1cb568dad23cd13f4cb51a6ec9..ec4e1c768c7f2a2421ac409a2eecc0100c086a6a 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -178,14 +178,15 @@ def train(use_cuda, train_program, params_dirname): if float(avg_cost) < 100.0: # Large value to increase CI speed trainer.save_params(params_dirname) else: - print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, - float(avg_cost))) + print( + ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost)))) if math.isnan(float(avg_cost)): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -207,14 +208,14 @@ def infer(use_cuda, inference_program, params_dirname): inference_program, param_path=params_dirname, place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. 
# Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index 8becd2404b0201c44b587a28e88995958082cd28..560f1189581f631dc6a3470cf8f22f902ca26f26 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import contextlib import numpy as np @@ -250,7 +252,7 @@ def decode_main(use_cuda, is_sparse): feeder = fluid.DataFeeder(feed_list, place) for data in train_data(): - feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict = feeder.feed([[x[0]] for x in data]) feed_dict['init_ids'] = init_ids feed_dict['init_scores'] = init_scores @@ -259,7 +261,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.recursive_sequence_lengths() + print(result_ids.recursive_sequence_lengths()) break diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 9a09db25dc0e2c71772aa06e6d0cf993321612e4..973308498bec3cddde2ef651751ad5d0c9f84503 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -11,10 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from __future__ import print_function + import argparse import paddle.fluid as fluid +import paddle.fluid.core as core import paddle +import six import sys import numpy import unittest @@ -61,14 +65,14 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, parallel, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, - parallel=True) + parallel=parallel) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): @@ -88,8 +92,10 @@ def train(use_cuda, train_program, params_dirname): if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): - print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(numpy.array, event.metrics))) + print( + ("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, + list(map(numpy.array, event.metrics))))) train_reader = paddle.batch( paddle.reader.shuffle( @@ -103,11 +109,14 @@ def train(use_cuda, train_program, params_dirname): feed_order=['img', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, @@ -118,20 +127,32 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results[0]) -def main(use_cuda): +def main(use_cuda, parallel): params_dirname = "recognize_digits_conv.inference.model" # call train() with is_local argument to run distributed train + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) + + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. + if parallel and use_cuda: + return + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) if __name__ == '__main__': - # for use_cuda in (False, True): - main(use_cuda=True) + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index b2b544e791d7ea35ff7d2c9a2dce7ce7f5680f38..cb4aeb430e1a9662a183084c0cdacc41c5a8ec11 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -11,10 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from __future__ import print_function + import argparse import paddle.fluid as fluid +import paddle.fluid.core as core import paddle +import six import sys import numpy import unittest @@ -48,11 +52,14 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, params_dirname, parallel): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_func) + train_func=train_program, + place=place, + optimizer_func=optimizer_func, + parallel=parallel) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): @@ -84,11 +91,14 @@ def train(use_cuda, train_program, params_dirname): feed_order=['img', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, @@ -99,20 +109,32 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results[0]) -def main(use_cuda): +def main(use_cuda, parallel): params_dirname = "recognize_digits_mlp.inference.model" # call train() with is_local argument to run distributed train + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) + + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. + if parallel and use_cuda: + return + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) if __name__ == '__main__': - # for use_cuda in (False, True): - main(use_cuda=False) + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index c860f1641708d947fd2a8008d3d3ccd0a231f6c2..9e2767783bb6748cfc8f95567627068d7532a8c8 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import math import sys import numpy as np @@ -186,8 +188,9 @@ def train(use_cuda, train_program, params_dirname): trainer.save_params(params_dirname) trainer.stop() else: - print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, - float(avg_cost))) + print( + ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost)))) if math.isnan(float(avg_cost)): sys.exit("got NaN loss, training failed.") diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 1668ae83d3581125b799508c8c3115a038e93d5a..097c2a468fca558106aba2f24c332256189d9076 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -98,7 +98,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -125,14 +125,14 @@ def infer(use_cuda, inference_program, params_dirname=None): place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. 
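The comment block above, which recurs throughout these tests, describes length-based LoD; a small illustrative sketch of the same idea with made-up word ids, using fluid.create_lod_tensor:

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
# Three sentences of lengths 3, 4 and 2: nine words in total, each word a
# [1]-shaped index into the vocabulary.
word_ids = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9]]).astype('int64')
recursive_seq_lens = [[3, 4, 2]]   # one level of detail, three sequences
tensor_words = fluid.create_lod_tensor(word_ids, recursive_seq_lens, place)
print(tensor_words.recursive_sequence_lengths())   # [[3, 4, 2]]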
recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index 8da89d82cb8e00853eebfd794602a0e1e1020e7c..5f74cd142590abb93f8846bc831a9f5e3dd2f311 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -113,7 +113,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -140,14 +140,14 @@ def infer(use_cuda, inference_program, params_dirname=None): place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 74faa2e8aa734cd644dfcc38127fd12df1fb1092..284a6ca168636377699c287236c491352566909b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -107,7 +107,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -135,14 +135,14 @@ def infer(use_cuda, inference_program, params_dirname=None): place=place) # Setup input by creating LoDTensor to represent sequence of words. 
- # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index 02e65cf56c4d1bd262831320befd2edc735c0d1c..1c7cf3199a07c3f65d967eda70a481b1bd1b1638 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid as fluid import numpy as np diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 1df7b99aad6094a8b8ddfe783b9de35cef61c524..82f1c6615f3c4ca54bf5e979b55082022cd4da9f 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function +from paddle.fluid.layers.device import get_places import unittest import paddle.fluid as fluid import paddle @@ -144,7 +146,7 @@ def train(word_dict, cost, acc_out, prediction = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) else: - places = fluid.layers.get_places() + places = get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): cost, acc, _ = net_method( @@ -175,7 +177,7 @@ def train(word_dict, def train_loop(main_program): exe.run(fluid.default_startup_program()) - for pass_id in xrange(PASS_NUM): + for pass_id in range(PASS_NUM): for data in train_data(): cost_val, acc_val = exe.run(main_program, feed=feeder.feed(data), @@ -235,14 +237,14 @@ def infer(word_dict, use_cuda, save_dirname=None): word_dict_len = len(word_dict) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. 
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 71bf5f8b3a9b17f24ce35220a9348bb871852623..334294ab485cf203aa0ccf680a53010322d3af3b 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid as fluid import contextlib @@ -114,7 +116,7 @@ def infer(use_cuda, save_dirname=None): test_reader = paddle.batch( paddle.dataset.uci_housing.test(), batch_size=batch_size) - test_data = test_reader().next() + test_data = next(test_reader()) test_feat = numpy.array( [data[0] for data in test_data]).astype("float32") test_label = numpy.array( diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index a2fb186b86c9706ac1aff0de49defbfb06e2eb0f..9fe361425c128590da910128beaccb3336f8ba57 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -62,7 +62,7 @@ def resnet_cifar10(input, depth=32): return tmp assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 + n = (depth - 2) // 6 conv1 = conv_bn_layer( input=input, ch_out=16, filter_size=3, stride=1, padding=1) res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) @@ -121,7 +121,7 @@ def train(net_type, use_cuda, save_dirname, is_local): avg_cost = fluid.layers.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) - # Test program + # Test program test_program = fluid.default_main_program().clone(for_test=True) optimizer = fluid.optimizer.Adam(learning_rate=0.001) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index d489feae9c568ec1d9e3a230766d10d1ced0200a..f63387a90617dc4e9b7c9ee7caa2d01595237a03 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import contextlib import math import numpy as np @@ -181,7 +183,7 @@ def train(use_cuda, save_dirname=None, is_local=True): start_time = time.time() batch_id = 0 - for pass_id in xrange(PASS_NUM): + for pass_id in range(PASS_NUM): for data in train_data(): cost = exe.run(main_program, feed=feeder.feed(data), @@ -248,14 +250,14 @@ def infer(use_cuda, save_dirname=None): fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 90c301a66105d8d872ee531556c5060b5d727515..5e241aaa32727686b84a0354a11d5a92f9576a90 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import contextlib import numpy as np @@ -199,7 +201,7 @@ def train_main(use_cuda, is_sparse, is_local=True): feeder = fluid.DataFeeder(feed_list, place) batch_id = 0 - for pass_id in xrange(1): + for pass_id in range(1): for data in train_data(): outs = exe.run(main_program, feed=feeder.feed(data), @@ -273,7 +275,7 @@ def decode_main(use_cuda, is_sparse): feeder = fluid.DataFeeder(feed_list, place) for data in train_data(): - feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict = feeder.feed([[x[0]] for x in data]) feed_dict['init_ids'] = init_ids feed_dict['init_scores'] = init_scores @@ -282,7 +284,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.recursive_sequence_lengths() + print(result_ids.recursive_sequence_lengths()) break diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 5f5c8544bbdb87421f129b201a0ebaf4cb8602a1..da216d0cc4a2867cb169240d28235b6db747a818 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -11,16 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -import argparse -import paddle.fluid as fluid -import paddle -import sys -import numpy -import unittest + +import paddle.fluid.core as core import math -import sys import os +import sys +import unittest + +import numpy + +import paddle +import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places BATCH_SIZE = 64 @@ -76,7 +80,7 @@ def train(nn_type, net_conf = conv_net if parallel: - places = fluid.layers.get_places() + places = get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): img_ = pd.read_input(img) @@ -255,6 +259,8 @@ def inject_test_method(use_cuda, parallel, nn_type, combine): def inject_all_tests(): for use_cuda in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue for parallel in (False, True): for nn_type in ('mlp', 'conv'): inject_test_method(use_cuda, parallel, nn_type, True) diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 6548766ef5d0162b50d4dd072e8e91dd95dc5d2b..cf8c48f34697d789d3d81d4d94f90a7169657baf 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import math import sys import os @@ -260,15 +262,15 @@ def infer(use_cuda, save_dirname=None): # Use the first data from paddle.dataset.movielens.test() as input assert feed_target_names[0] == "user_id" - # Use create_lod_tensor(data, recursive_sequence_lengths, place) API - # to generate LoD Tensor where `data` is a list of sequences of index - # numbers, `recursive_sequence_lengths` is the length-based level of detail + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one - # level of detail info, indicating that `data` consists of two sequences - # of length 3 and 2, respectively. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. user_id = fluid.create_lod_tensor([[1]], [[1]], place) assert feed_target_names[1] == "gender_id" diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 467282624154086a874b0e73736ed5b1358915ff..91c8705aa4c88dbfeea45e15c368459ba5b5ac1f 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
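The recommender-system comments above also describe the nested-list form of create_lod_tensor; a short sketch using the document's own example data:

import paddle.fluid as fluid

place = fluid.CPUPlace()
# Two sequences of indexes, of length 3 and 2, matching the comment above.
data = [[10, 2, 3], [2, 3]]
recursive_seq_lens = [[3, 2]]
lod_tensor = fluid.create_lod_tensor(data, recursive_seq_lens, place)
print(lod_tensor.recursive_sequence_lengths())   # [[3, 2]]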
+from __future__ import print_function + import numpy as np import paddle import paddle.fluid as fluid @@ -175,7 +177,7 @@ def train(use_cuda, save_dirname=None): feeder = fluid.DataFeeder(feed_list, place) batch_id = 0 - for pass_id in xrange(2): + for pass_id in range(2): for data in train_data(): outs = exe.run(framework.default_main_program(), feed=feeder.feed(data), @@ -213,14 +215,14 @@ def infer(use_cuda, save_dirname=None): fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[4, 6]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for two sentences of - # length 4 and 6, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for two sentences of + # length 4 and 6, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[4, 6]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 49bd72c7a53c0ae740bdbabe15b1d37340699d41..fe063eb4629dbe06dc65ce98c6c01858db901f03 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places import unittest import os import numpy as np @@ -80,13 +83,15 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = fluid.layers.get_places() + places = get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): avg_cost, predict_word = __network__( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, next_word - ])) + list( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, + next_word + ]))) pd.write_output(avg_cost) avg_cost = fluid.layers.mean(pd()) @@ -166,11 +171,11 @@ def infer(use_cuda, save_dirname=None): word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) - # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word - # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, - # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] - # meaning there is only one level of detail and there is only one sequence of + # Setup inputs by creating 4 LoDTensors representing 4 words. 
Here each word + # is simply an index to look up for the corresponding word vector and hence + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of # one word on this level. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[1]] @@ -244,7 +249,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): is_sparse=is_sparse, is_parallel=is_parallel) - if use_cuda and is_sparse: + if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse: fn = __impl__ else: # skip the other test when on CI server diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index be347cd5315668dde0454d7959dbf9bcfa465b5f..f530f8f4882a23df18c141b51560cb618fce86b5 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import paddle -import paddle.fluid as fluid +from __future__ import print_function + import math import sys +import paddle +import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places + # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization # version. @@ -34,7 +37,7 @@ if fluid.core.is_compiled_with_cuda(): use_nccl = False place = fluid.CUDAPlace(0) -places = fluid.layers.get_places(device_count=0, device_type=device_type) +places = get_places(device_count=0, device_type=device_type) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) with pd.do(): x_ = pd.read_input(x) @@ -77,7 +80,7 @@ for pass_id in range(PASS_NUM): if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. 
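Both files above switch to importing get_places from paddle.fluid.layers.device; the ParallelDo pattern they drive can be summarized in a short hedged sketch, where the input variable `x` and the per-replica network `build_net` are placeholders:

import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places

# Replicate a sub-block over the available places; each replica reads a slice
# of the input and writes its cost back, and the outputs are then averaged.
places = get_places(device_count=0, device_type='CPU')
pd = fluid.layers.ParallelDo(places)
with pd.do():
    x_ = pd.read_input(x)        # `x` is an assumed data layer
    cost_ = build_net(x_)        # placeholder for the per-replica network
    pd.write_output(cost_)
avg_cost = fluid.layers.mean(pd())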
- print avg_loss_value[0] + print(avg_loss_value[0]) if math.isnan(float(avg_loss_value)): sys.exit("got NaN loss, training failed.") exit(1) diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index dfebb9a06ea4f290f128c486dcaccaeccdcef8c4..a231bbfbc8d5712275c92b4d27580016825ea91b 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -58,7 +58,7 @@ def resnet_cifar10(input, depth=32): return tmp assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 + n = (depth - 2) // 6 conv1 = conv_bn_layer( input=input, ch_out=16, filter_size=3, stride=1, padding=1) res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) @@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) -# fluid.memory_optimize(fluid.default_main_program(), level=0) -fluid.release_memory(fluid.default_main_program()) +fluid.memory_optimize(fluid.default_main_program(), level=0) +# fluid.release_memory(fluid.default_main_program()) BATCH_SIZE = 16 PASS_NUM = 1 diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index fa696acdfa9058af14f0bd34ce1a2980db5aeafc..e520c8965089263d1ba10a6057acda1a53cc34a9 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import paddle import paddle.fluid as fluid @@ -90,8 +92,8 @@ def main(): optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) - # fluid.memory_optimize(fluid.default_main_program()) - fluid.release_memory(fluid.default_main_program()) + fluid.memory_optimize(fluid.default_main_program()) + # fluid.release_memory(fluid.default_main_program()) # fix the order of training data train_data = paddle.batch( @@ -118,7 +120,7 @@ def main(): feeder = fluid.DataFeeder(feed_list, place) batch_id = 0 - for pass_id in xrange(10): + for pass_id in range(10): for data in train_data(): outs = exe.run(fluid.default_main_program(), feed=feeder.feed(data), diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py index 8ea1b2b15cc0c0eb5bca67a9c5a6ac6c6774e7e2..bd77779ce6ab5cf19e3e5ace3e51e39734b27c10 100644 --- a/python/paddle/fluid/tests/demo/fc_gan.py +++ b/python/paddle/fluid/tests/demo/fc_gan.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import errno import math import os @@ -137,7 +139,7 @@ def main(): generated_img = exe.run(g_program, feed={'noise': n}, fetch_list={g_img})[0] - real_data = numpy.array(map(lambda x: x[0], data)).astype('float32') + real_data = numpy.array([x[0] for x in data]).astype('float32') real_data = real_data.reshape(num_true, 784) total_data = numpy.concatenate([real_data, generated_img]) total_label = numpy.concatenate([ @@ -150,7 +152,7 @@ def main(): feed={'img': total_data, 'label': total_label}, fetch_list={d_loss})[0] - for _ in xrange(NUM_TRAIN_TIMES_OF_DG): + for _ in range(NUM_TRAIN_TIMES_OF_DG): n = numpy.random.uniform( low=-1.0, high=1.0, size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape( diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/file_reader/.gitignore similarity index 100% rename from python/paddle/fluid/tests/demo/text_classification/.gitignore rename to python/paddle/fluid/tests/demo/file_reader/.gitignore diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py similarity index 84% rename from python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py rename to python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py index 9425d472a48056e71da5da364f659971ef6c2520..45a104ec9625eacfcb87ea6eae619e3d71410da9 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import sys import paddle.fluid as fluid import paddle.v2 as paddle @@ -31,9 +33,12 @@ def load_vocab(filename): # load word dict with paddle inner function -word_dict = load_vocab(sys.argv[1]) -word_dict[""] = len(word_dict) -print "Dict dim = ", len(word_dict) +if len(sys.argv) == 1: + word_dict = paddle.dataset.imdb.word_dict() +else: + word_dict = load_vocab(sys.argv[1]) + word_dict[""] = len(word_dict) +print("Dict dim = ", len(word_dict)) # input text data data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1) @@ -47,7 +52,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace()) BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=10000), + paddle.dataset.imdb.train(word_dict), buf_size=25000), batch_size=BATCH_SIZE) test_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/demo/file_reader/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5f5d2848da42e18f2a142faae0c89352344d8cee --- /dev/null +++ b/python/paddle/fluid/tests/demo/file_reader/train.py @@ -0,0 +1,140 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle.fluid as fluid +import numpy +import sys + +TRAIN_FILES = ['train.recordio'] +TEST_FILES = ['test.recordio'] + +DICT_DIM = 5147 + +# embedding dim +emb_dim = 128 + +# hidden dim +hid_dim = 128 + +# class num +class_dim = 2 + +# epoch num +epoch_num = 10 + + +def build_program(is_train): + file_obj_handle = fluid.layers.io.open_files( + filenames=TRAIN_FILES if is_train else TEST_FILES, + shapes=[[-1, 1], [-1, 1]], + lod_levels=[1, 0], + dtypes=['int64', 'int64']) + + file_obj = fluid.layers.io.double_buffer(file_obj_handle) + + with fluid.unique_name.guard(): + + data, label = fluid.layers.read_file(file_obj) + + emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + + # cross entropy loss + cost = fluid.layers.cross_entropy(input=prediction, label=label) + + # mean loss + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + if is_train: + # SGD optimizer + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle} + + +def main(): + train = fluid.Program() + startup = fluid.Program() + test = fluid.Program() + + with fluid.program_guard(train, startup): + train_args = build_program(is_train=True) + + with fluid.program_guard(test, startup): + test_args = build_program(is_train=False) + + use_cuda = fluid.core.is_compiled_with_cuda() + # startup + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place=place) + exe.run(startup) + + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=train_args['loss'].name, + main_program=train) + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, main_program=test, share_vars_from=train_exe) + + fetch_var_list = [var.name for var in train_args['log']] + for epoch_id in range(epoch_num): + # train + try: + batch_id = 0 + while True: + loss, acc = map(numpy.array, + train_exe.run(fetch_list=fetch_var_list)) + print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc + batch_id += 1 + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_args['file'].reset() + + # test + loss = [] + acc = [] + try: + while True: + loss_np, acc_np = map(numpy.array, + test_exe.run(fetch_list=fetch_var_list)) + loss.append(loss_np[0]) + acc.append(acc_np[0]) + except: + test_args['file'].reset() + print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py new file mode 100644 index 0000000000000000000000000000000000000000..ec61e0ebae4feb1a2177da916b77b2ba2d3981b9 --- /dev/null +++ b/python/paddle/fluid/tests/demo/pyreader.py @@ -0,0 +1,101 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy +import six + +import paddle +import paddle.dataset.mnist as mnist +import paddle.fluid as fluid +import paddle.v2 + + +def network(is_train): + reader = fluid.layers.py_reader( + capacity=10, + shapes=((-1, 784), (-1, 1)), + dtypes=('float32', 'int64'), + name="train_reader" if is_train else "test_reader", + use_double_buffer=True) + img, label = fluid.layers.read_file(reader) + + hidden = img + + for i in six.moves.xrange(2): + hidden = fluid.layers.fc(input=hidden, size=100, act='tanh') + hidden = fluid.layers.dropout( + hidden, dropout_prob=0.5, is_test=not is_train) + + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + return fluid.layers.mean(loss), reader + + +def main(): + train_prog = fluid.Program() + startup_prog = fluid.Program() + + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + loss, train_reader = network(True) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(loss) + + test_prog = fluid.Program() + test_startup = fluid.Program() + with fluid.program_guard(test_prog, test_startup): + with fluid.unique_name.guard(): + test_loss, test_reader = network(False) + + use_cuda = fluid.core.is_compiled_with_cuda() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + fluid.Executor(place).run(startup_prog) + fluid.Executor(place).run(test_startup) + + trainer = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog) + + tester = fluid.ParallelExecutor( + use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog) + + train_reader.decorate_paddle_reader( + paddle.v2.reader.shuffle( + paddle.batch(mnist.train(), 512), buf_size=8192)) + + test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512)) + + for epoch_id in six.moves.xrange(10): + train_reader.start() + try: + while True: + print 'train_loss', numpy.array( + trainer.run(fetch_list=[loss.name])) + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_reader.reset() + + test_reader.start() + try: + while True: + print 'test loss', numpy.array( + tester.run(fetch_list=[test_loss.name])) + except fluid.core.EOFException: + print 'End of testing' + test_reader.reset() + + +if __name__ == '__main__': + main() diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py deleted file mode 100644 index e408684c6e0941a1b317ffeac66f071c1382836d..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -import numpy -import sys - -TRAIN_FILES = ['train.recordio'] -TEST_FILES = ['test.recordio'] - -DICT_DIM = 89528 - -# embedding dim -emb_dim = 128 - -# hidden dim -hid_dim = 128 - -# hidden dim2 -hid_dim2 = 96 - -# class num -class_dim = 2 - - -def network_cfg(is_train, pass_num=100): - with fluid.unique_name.guard(): - train_file_obj = fluid.layers.open_files( - filenames=TRAIN_FILES, - pass_num=pass_num, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - - test_file_obj = fluid.layers.open_files( - filenames=TEST_FILES, - pass_num=1, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - - if is_train: - file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000) - else: - file_obj = test_file_obj - - file_obj = fluid.layers.double_buffer( - file_obj, - name="train_double_buffer" if is_train else 'test_double_buffer') - - data, label = fluid.layers.read_file(file_obj) - - emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) - - # sequence conv with window size = 3 - win_size = 3 - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=win_size, - act="tanh", - pool_type="max") - - # fc layer after conv - fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) - - # probability of each class - prediction = fluid.layers.fc(input=[fc_1], - size=class_dim, - act="softmax") - # cross entropy loss - cost = fluid.layers.cross_entropy(input=prediction, label=label) - - # mean loss - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - if is_train: - # SGD optimizer - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) - sgd_optimizer.minimize(avg_cost) - - return { - 'loss': avg_cost, - 'log': [avg_cost, acc], - 'file': train_file_obj if is_train else test_file_obj - } - - -def main(): - train = fluid.Program() - startup = fluid.Program() - - with fluid.program_guard(train, startup): - train_args = network_cfg(is_train=True) - - test = fluid.Program() - - with fluid.program_guard(test, fluid.Program()): - test_args = network_cfg(is_train=False) - - # startup - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place=place) - exe.run(startup) - - train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=train_args['loss'].name, main_program=train) - - fetch_var_list = [var.name for var in train_args['log']] - for i in xrange(sys.maxint): - result = map(numpy.array, - train_exe.run(fetch_list=fetch_var_list - if i % 1000 == 0 else [])) - if len(result) != 0: - print 'Train: ', result - - if i % 1000 == 0: - test_exe = fluid.ParallelExecutor( - use_cuda=True, main_program=test, share_vars_from=train_exe) - loss = [] - acc = [] - try: - while True: - loss_np, acc_np = map( - numpy.array, test_exe.run(fetch_list=fetch_var_list)) - loss.append(loss_np[0]) - acc.append(acc_np[0]) - except: - test_args['file'].reset() - print 'TEST: ', numpy.mean(loss), numpy.mean(acc) - - -if __name__ == '__main__': - main() diff --git 
a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py index e8f6cfb4a907b2c01e9662e7e9bf2cb0fbd6cb1b..b5d7676f4a2cb085c6900cd0bd0644afa2b2afd5 100644 --- a/python/paddle/fluid/tests/no_test_concurrency.py +++ b/python/paddle/fluid/tests/no_test_concurrency.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.core as core @@ -194,7 +196,7 @@ class TestRoutineOp(unittest.TestCase): quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) with fluid.Go(): - for i in xrange(10): + for i in range(10): fluid.channel_recv(ch1, result) Print(result) diff --git a/python/paddle/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/notest_concurrency.py index 77107f8b36f31c1f494b0ade218ee047ef7eb7c6..fd9da4cce0ea51c53b4b01e7c3dc2a2ed1eeb089 100644 --- a/python/paddle/fluid/tests/notest_concurrency.py +++ b/python/paddle/fluid/tests/notest_concurrency.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fe8a9daa3bea4b99bb42edc78538685c5ce11fe3 --- /dev/null +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -0,0 +1,267 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A simple machine translation demo using beam search decoder. 
+""" + +from __future__ import print_function + +import contextlib +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +from paddle.fluid.contrib.decoder.beam_search_decoder import * +import unittest +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 32 +decoder_size = hidden_dim +IS_SPARSE = True +batch_size = 2 +max_length = 8 +topk_size = 50 +trg_dic_size = 10000 +beam_size = 2 + + +def encoder(): + # encoder + src_word = layers.data( + name="src_word", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + return encoder_out + + +def decoder_state_cell(context): + h = InitState(init=context, need_reorder=True) + state_cell = StateCell(inputs={'x': None}, states={'h': h}, out_state='h') + + @state_cell.state_updater + def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + # make sure lod of h heritted from prev_h + h = layers.fc(input=[prev_h, current_word], + size=decoder_size, + act='tanh') + state_cell.set_state('h', h) + + return state_cell + + +def decoder_train(state_cell): + # decoder + trg_language_word = layers.data( + name="target_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + decoder = TrainingDecoder(state_cell) + + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = layers.fc(input=decoder.state_cell.get_state('h'), + size=target_dict_dim, + act='softmax') + decoder.state_cell.update_states() + decoder.output(current_score) + + return decoder() + + +def decoder_decode(state_cell): + init_ids = layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=word_dim, + input_var_dict={}, + topk_size=topk_size, + sparse_emb=IS_SPARSE, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None) + decoder.decode() + translation_ids, translation_scores = decoder() + + return translation_ids, translation_scores + + +def train_main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder() + state_cell = decoder_state_cell(context) + rnn_out = decoder_train(state_cell) + label = layers.data( + name="target_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3) + optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.reader.shuffle( + 
paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + feed_order = ['src_word', 'target_word', 'target_next_word'] + + exe = Executor(place) + + def train_loop(main_program): + exe.run(framework.default_startup_program()) + + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + for pass_id in range(1): + for batch_id, data in enumerate(train_reader()): + outs = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + break + + train_loop(framework.default_main_program()) + + +def decode_main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder() + state_cell = decoder_state_cell(context) + translation_ids, translation_scores = decoder_decode(state_cell) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [1] * batch_size + init_lod = [init_lod, init_lod] + + init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) + init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + feed_order = ['src_word'] + feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + data = next(train_reader()) + feed_dict = feeder.feed([[x[0]] for x in data]) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print(result_ids.lod()) + + +class TestBeamSearchDecoder(unittest.TestCase): + pass + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +def inject_test_train(use_cuda): + f_name = 'test_{0}_train'.format('cuda' if use_cuda else 'cpu') + + def f(*args): + with scope_prog_guard(): + train_main(use_cuda) + + setattr(TestBeamSearchDecoder, f_name, f) + + +def inject_test_decode(use_cuda, decorator=None): + f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu', 'sparse') + + def f(*args): + with scope_prog_guard(): + decode_main(use_cuda) + + if decorator is not None: + f = decorator(f) + + setattr(TestBeamSearchDecoder, f_name, f) + + +for _use_cuda_ in (False, True): + inject_test_train(_use_cuda_) + +for _use_cuda_ in (False, True): + _decorator_ = None + inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py index 6cc291dfcffdd7083f498389834e37bd06ca4572..b2a5253b9500bb504c651b2ab684206133199ada 100644 --- 
a/python/paddle/fluid/tests/test_cpp_reader.py +++ b/python/paddle/fluid/tests/test_cpp_reader.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid as fluid import numpy as np diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py index 30b7a634a2b978df85d6432854ef12285460be44..01de564aa438e5f14a5c578f7bbbfb475155ca55 100644 --- a/python/paddle/fluid/tests/test_data_feeder.py +++ b/python/paddle/fluid/tests/test_data_feeder.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid import unittest diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 2d70c986b1b6c42ff709e9cf3b4234cf4fc26836..56129641ce5900d82aedf243d2fa1eadfd6b8d86 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function + import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard @@ -145,6 +146,64 @@ class TestAnchorGenerator(unittest.TestCase): assert anchor.shape[3] == 4 +class TestGenerateProposalLabels(unittest.TestCase): + def test_generate_proposal_labels(self): + program = Program() + with program_guard(program): + rpn_rois = layers.data( + name='rpn_rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_boxes = layers.data( + name='gt_boxes', + shape=[6, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + class_nums = 5 + rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_boxes=gt_boxes, + im_info=im_info, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=class_nums) + assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * class_nums + assert bbox_inside_weights.shape[1] == 4 * class_nums + assert bbox_outside_weights.shape[1] == 4 * class_nums + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] @@ -200,5 +259,109 @@ class TestDetectionMAP(unittest.TestCase): print(str(program)) +class TestRpnTargetAssign(unittest.TestCase): + def test_rpn_target_assign(self): + program = Program() + with program_guard(program): + bbox_pred_shape = [10, 50, 4] + cls_logits_shape = [10, 50, 2] + anchor_shape = [50, 4] + + bbox_pred = layers.data( + name='bbox_pred', + shape=bbox_pred_shape, + append_batch_size=False, + dtype='float32') + cls_logits = 
layers.data( + name='cls_logits', + shape=cls_logits_shape, + append_batch_size=False, + dtype='float32') + anchor_box = layers.data( + name='anchor_box', + shape=anchor_shape, + append_batch_size=False, + dtype='float32') + anchor_var = layers.data( + name='anchor_var', + shape=anchor_shape, + append_batch_size=False, + dtype='float32') + gt_boxes = layers.data( + name='gt_boxes', shape=[4], lod_level=1, dtype='float32') + is_crowd = layers.data( + name='is_crowd', + shape=[10], + dtype='int32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( + bbox_pred=bbox_pred, + cls_logits=cls_logits, + anchor_box=anchor_box, + anchor_var=anchor_var, + gt_boxes=gt_boxes, + is_crowd=is_crowd, + im_info=im_info, + rpn_batch_size_per_im=256, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, + rpn_positive_overlap=0.7, + rpn_negative_overlap=0.3) + + self.assertIsNotNone(pred_scores) + self.assertIsNotNone(pred_loc) + self.assertIsNotNone(tgt_lbl) + self.assertIsNotNone(tgt_bbox) + assert pred_scores.shape[1] == 1 + assert pred_loc.shape[1] == 4 + assert pred_loc.shape[1] == tgt_bbox.shape[1] + + +class TestGenerateProposals(unittest.TestCase): + def test_generate_proposals(self): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[1, 3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[1, num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index 89f4c64975802dc1827ec17ed3626b91e36d6971..3c977afc7c813908fbe2dfb7445d9ca183cf2231 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import print_function + import numpy as np import paddle import paddle.fluid as fluid @@ -36,7 +37,7 @@ with fluid.program_guard(main_program=prog): avg_cost = fluid.layers.mean(cost) prog_clip = prog.clone() -prog_clip.block(0).var(hidden1.name).set_error_clip( +prog_clip.block(0).var(hidden1.name)._set_error_clip( fluid.clip.ErrorClipByValue( max=CLIP_MAX, min=CLIP_MIN)) diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py index d530601f13be6810a8a99b13c92faf584df568f9..266687fcd092dfdeec9343e2592f4c22b683d588 100644 --- a/python/paddle/fluid/tests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/test_gradient_clip.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py similarity index 54% rename from python/paddle/fluid/tests/test_mnist_if_else_op.py rename to python/paddle/fluid/tests/test_if_else_op.py index d34f52db5ffc889f17513d034ad2c99f696b0cdf..61d81f483636a99ea9e0282de89f12e47f3b824c 100644 --- a/python/paddle/fluid/tests/test_mnist_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -12,18 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid.layers as layers -from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program +from paddle.fluid.framework import Program, program_guard from paddle.fluid.executor import Executor from paddle.fluid.optimizer import MomentumOptimizer import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.layers.control_flow import split_lod_tensor +from paddle.fluid.layers.control_flow import merge_lod_tensor +from paddle.fluid.layers.control_flow import ConditionalBlock + import unittest import numpy as np class TestMNISTIfElseOp(unittest.TestCase): - def test_raw_api(self): + # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 + def not_test_raw_api(self): prog = Program() startup_prog = Program() with program_guard(prog, startup_prog): @@ -31,14 +39,12 @@ class TestMNISTIfElseOp(unittest.TestCase): label = layers.data(name='y', shape=[1], dtype='int64') - limit = layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0) + limit = layers.fill_constant(shape=[1], dtype='int64', value=5) cond = layers.less_than(x=label, y=limit) - true_image, false_image = layers.split_lod_tensor( - input=image, mask=cond) + true_image, false_image = split_lod_tensor(input=image, mask=cond) true_out = layers.create_tensor(dtype='float32') - true_cond = layers.ConditionalBlock([true_image]) + true_cond = ConditionalBlock([cond]) with true_cond.block(): hidden = layers.fc(input=true_image, size=100, act='tanh') @@ -46,14 +52,14 @@ class TestMNISTIfElseOp(unittest.TestCase): layers.assign(input=prob, output=true_out) false_out = layers.create_tensor(dtype='float32') - false_cond = layers.ConditionalBlock([false_image]) + false_cond = ConditionalBlock([cond]) with false_cond.block(): hidden = layers.fc(input=false_image, size=200, act='tanh') prob = layers.fc(input=hidden, size=10, act='softmax') layers.assign(input=prob, output=false_out) - 
prob = layers.merge_lod_tensor( + prob = merge_lod_tensor( in_true=true_out, in_false=false_out, mask=cond, x=image) loss = layers.cross_entropy(input=prob, label=label) avg_loss = layers.mean(loss) @@ -64,7 +70,7 @@ class TestMNISTIfElseOp(unittest.TestCase): train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=8192), - batch_size=200) + batch_size=10) place = core.CPUPlace() exe = Executor(place) @@ -73,20 +79,21 @@ class TestMNISTIfElseOp(unittest.TestCase): PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") + x_data = np.array([x[0] for x in data]).astype("float32") + y_data = np.array([x[1] for x in data]).astype("int64") y_data = np.expand_dims(y_data, axis=1) outs = exe.run(prog, feed={'x': x_data, 'y': y_data}, fetch_list=[avg_loss]) - print outs[0] + print(outs[0]) if outs[0] < 1.0: return self.assertFalse(True) - def test_ifelse(self): + # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 + def not_test_ifelse(self): prog = Program() startup_prog = Program() with program_guard(prog, startup_prog): @@ -94,8 +101,7 @@ class TestMNISTIfElseOp(unittest.TestCase): label = layers.data(name='y', shape=[1], dtype='int64') - limit = layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0) + limit = layers.fill_constant(shape=[1], dtype='int64', value=5) cond = layers.less_than(x=label, y=limit) ie = layers.IfElse(cond) @@ -125,24 +131,95 @@ class TestMNISTIfElseOp(unittest.TestCase): place = core.CPUPlace() exe = Executor(place) - exe.run(kwargs['startup_program']) + exe.run(startup_prog) PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") + x_data = np.array([x[0] for x in data]).astype("float32") + y_data = np.array([x[1] for x in data]).astype("int64") y_data = y_data.reshape((y_data.shape[0], 1)) - outs = exe.run(kwargs['main_program'], + outs = exe.run(prog, feed={'x': x_data, 'y': y_data}, fetch_list=[avg_loss]) - print outs[0] + print(outs[0]) if outs[0] < 1.0: return self.assertFalse(True) +class TestIfElse(unittest.TestCase): + def set_test_case(self): + # condiction is: self.data < self.cond_value + self.cond_value = 0.5 + self.data = np.random.rand(25, 1).astype(np.float32) + + def numpy_cal(self): + s1 = self.data[np.where(self.data < self.cond_value)] + res = np.sum(np.exp(s1)) + s2 = self.data[np.where(self.data >= self.cond_value)] + res += np.sum(np.tanh(s2)) + return res + + def compare_ifelse_op_and_numpy(self, place): + self.set_test_case() + + prog = Program() + startup_prog = Program() + with program_guard(prog, startup_prog): + src = layers.data(name='data', shape=[1], dtype='float32') + cond = layers.fill_constant( + [1], dtype='float32', value=self.cond_value) + ifcond = layers.less_than(x=src, y=cond) + ie = layers.IfElse(ifcond) + with ie.true_block(): + true_target = ie.input(src) + true_target = fluid.layers.exp(true_target) + ie.output(true_target) + + with ie.false_block(): + false_target = ie.input(src) + false_target = fluid.layers.tanh(false_target) + ie.output(false_target) + if_out = ie() + out = layers.reduce_sum(if_out) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + fetch_list = [out] + o1, = 
exe.run(fluid.default_main_program(), + feed={'data': self.data}, + fetch_list=[out]) + o2 = self.numpy_cal() + + self.assertTrue( + np.allclose( + o1, o2, atol=1e-8), + "IfElse result : " + str(o1) + "\n Numpy result :" + str(o2)) + + def test_cpu(self): + self.compare_ifelse_op_and_numpy(fluid.CPUPlace()) + + def test_cuda(self): + if not core.is_compiled_with_cuda(): + return + self.compare_ifelse_op_and_numpy(fluid.CUDAPlace(0)) + + +class TestIfElseTrueBranch(TestIfElse): + def set_test_case(self): + # condiction is: self.data < self.cond_value + self.cond_value = 10. + self.data = np.random.rand(25, 1).astype(np.float32) + + +class TestIfElseFalseBranch(TestIfElse): + def set_test_case(self): + # condiction is: self.data < self.cond_value + self.cond_value = -10. + self.data = np.random.rand(25, 1).astype(np.float32) + + if __name__ == '__main__': - # temp disable if else unittest since it could be buggy. - exit(0) + unittest.main() diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index f7a9dd4129027417a06a6c25ff9a801fff259c5e..722b5f07b04f9374db3f262f5134347fe753ba19 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor import numpy as np diff --git a/python/paddle/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py index b5ac97eac559e8c52a8949cfd63fc8671ba52514..5f92c437ec726f510d9194d23f1a01a5478827d6 100644 --- a/python/paddle/fluid/tests/test_python_operator_overriding.py +++ b/python/paddle/fluid/tests/test_python_operator_overriding.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f6c8dcabcbc592024188f4742e6c532a704d2289..9892d3f8075d21b9aa01cfda0bb73e4d12008852 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -12,6 +12,11 @@ endif(NOT WITH_MKLDNN) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) + list(REMOVE_ITEM TEST_OPS test_dist_transpiler) + list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler) + list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) + LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 @@ -23,6 +28,10 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test +if(APPLE) + # this op is not support on mac + list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) +endif() function(py_test_modules TARGET_NAME) if(WITH_TESTING) @@ -35,21 +44,34 @@ function(py_test_modules TARGET_NAME) ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() endif() endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) +list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +list(REMOVE_ITEM TEST_OPS test_dist_transformer) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) +list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) -py_test_modules(test_dist_train MODULES test_dist_train SERIAL) +if(WITH_DISTRIBUTE) + py_test_modules(test_dist_train MODULES test_dist_train SERIAL) + set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) + set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) + set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) + py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) +endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) -set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) -set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) -set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180) +set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) +py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) 
+py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py index e891ee932f1440001eb25b222f1f4613e97dfcb1..9ea95f3e8700274977eda4ca113a6468c631584c 100644 --- a/python/paddle/fluid/tests/unittests/benchmark.py +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import unittest import time import itertools +import six import paddle.fluid as fluid import paddle.fluid.core as core @@ -40,8 +43,8 @@ class BenchmarkSuite(OpTest): expect_t = np.array(item_cpu_out) actual = item_gpu_out actual_t = np.array(item_gpu_out) - var_name = variable if isinstance(variable, - basestring) else variable.name + var_name = variable if isinstance( + variable, six.string_types) else variable.name self.assertTrue( np.allclose( actual_t, expect_t, atol=atol), @@ -53,7 +56,7 @@ class BenchmarkSuite(OpTest): def _get_input_names(self): inputs = [] - for name, value in self.inputs.iteritems(): + for name, value in six.iteritems(self.inputs): if isinstance(value, list): inputs.extend([sub_name for sub_name, _ in value]) inputs.append(name) @@ -61,7 +64,7 @@ class BenchmarkSuite(OpTest): def _get_output_names(self): outputs = [] - for var_name, var in self.outputs.iteritems(): + for var_name, var in six.iteritems(self.outputs): if isinstance(var, list): for sub_var_name, sub_var in var: outputs.append(sub_var_name) diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py index 91a5f1bca4441d80489a02eb9283928e38321826..0e7338b839e2a7f5808e7a752e9ca6389622c2cb 100644 --- a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py +++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/decorators.py b/python/paddle/fluid/tests/unittests/decorators.py index d1165e2a9199454dbcc1fda411afad20449bcc92..1a5f4540cf033b4d3244537cc5016ee06f341464 100644 --- a/python/paddle/fluid/tests/unittests/decorators.py +++ b/python/paddle/fluid/tests/unittests/decorators.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid __all__ = ['many_times', 'prog_scope'] diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..85a96c0b53f6bc08687965048d6251265055a6fe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -0,0 +1,106 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.3))) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.2))) + + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + return predict + + +class TestDistMnist2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestDistMnist2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..a4ffe7d40c40501ebd43fec0b664159227ea34bd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -0,0 +1,258 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import sys +import signal +from test_dist_base import TestDistRunnerBase, runtime_main + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class SE_ResNeXt(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + conv = self.conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = self.conv_bn_layer( + input=conv, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc( + input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05))) + return out + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return self.conv_bn_layer(input, ch_out, filter_size, stride) 
+ else: + return input + + def bottleneck_block(self, input, num_filters, stride, cardinality, + reduction_ratio): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = self.squeeze_excitation( + input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = self.shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + # avoid pserver CPU init differs from GPU + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def squeeze_excitation(self, input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc( + input=pool, + size=num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='relu') + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc( + input=squeeze, + size=num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +class DistSeResneXt2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Input data + image = fluid.layers.data( + name="data", shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name="int64", shape=[1], dtype='int64') + + # Train program + model = SE_ResNeXt(layers=50) + out = model.net(input=image, class_dim=102) + cost = fluid.layers.cross_entropy(input=out, label=label) + + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + # Evaluator + test_program = fluid.default_main_program().clone(for_test=True) + + # Optimization + total_images = 6149 # flowers + epochs = [30, 60, 90] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in epochs] + base_lr = 0.1 + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) + + # Reader + train_reader = paddle.batch( + paddle.dataset.flowers.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) + + return test_program, avg_cost, train_reader, test_reader, acc_top1, out + + +if __name__ == "__main__": + runtime_main(DistSeResneXt2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py new file mode 100644 index 
0000000000000000000000000000000000000000..e3db316698398ff693157d583ad1410d10dcf81d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -0,0 +1,1732 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math +import os +import sys +import six +import argparse +import ast +import multiprocessing +import time +from functools import partial +from os.path import expanduser +import glob +import random +import tarfile + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid import core +from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.compat as cpt +from paddle.compat import long_type + +import hashlib + +from paddle.fluid.transpiler.details import program_to_code + +const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) +const_bias_attr = const_para_attr + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +#from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list +class TrainTaskConfig(object): + # only support GPU currently + use_gpu = True + # the epoch number to train. + pass_num = 1 + # the number of sequences contained in a mini-batch. + # deprecated, set batch_size in args. + batch_size = 20 + # the hyper parameters for Adam optimizer. + # This static learning_rate will be multiplied to the LearningRateScheduler + # derived learning rate the to get the final learning rate. + learning_rate = 1 + beta1 = 0.9 + beta2 = 0.98 + eps = 1e-9 + # the parameters for learning rate scheduling. + warmup_steps = 4000 + # the weight used to mix up the ground-truth distribution and the fixed + # uniform distribution in label smoothing when training. + # Set this as zero if label smoothing is not wanted. + label_smooth_eps = 0.1 + # the directory for saving trained models. + model_dir = "trained_models" + # the directory for saving checkpoints. + ckpt_dir = "trained_ckpts" + # the directory for loading checkpoint. + # If provided, continue training from the checkpoint. + ckpt_path = None + # the parameter to initialize the learning rate scheduler. + # It should be provided if use checkpoints, since the checkpoint doesn't + # include the training step counter currently. 
+ start_step = 0 + + check_acc = True + + data_path = expanduser("~") + ( + "/.cache/paddle/dataset/test_dist_transformer/") + src_vocab_fpath = data_path + "vocab.bpe.32000" + trg_vocab_fpath = data_path + "vocab.bpe.32000" + train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de" + val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de" + pool_size = 2000 + sort_type = None + local = True + shuffle = False + shuffle_batch = False + special_token = ['', '', ''] + token_delimiter = ' ' + use_token_batch = False + + +class InferTaskConfig(object): + use_gpu = True + # the number of examples in one run for sequence generation. + batch_size = 10 + # the parameters for beam search. + beam_size = 5 + max_out_len = 256 + # the number of decoded sentences to output. + n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = True + # the directory for loading the trained model. + model_path = "trained_models/pass_1.infer.model" + + +class ModelHyperParams(object): + # These following five vocabularies related configurations will be set + # automatically according to the passed vocabulary path and special tokens. + # size of source word dictionary. + src_vocab_size = 10000 + # size of target word dictionay + trg_vocab_size = 10000 + # index for token + bos_idx = 0 + # index for token + eos_idx = 1 + # index for token + unk_idx = 2 + # max length of sequences deciding the size of position encoding table. + # Start from 1 and count start and end tokens in. + max_length = 256 + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 2048 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rate used by all dropout layers. + dropout = 0.0 # no random + # random seed used in dropout for CE. + dropout_seed = None + # the flag indicating whether to share embedding and softmax weights. + # vocabularies in source and target should be same for weight sharing. + weight_sharing = True + + +def merge_cfg_from_list(cfg_list, g_cfgs): + """ + Set the above global configurations using the cfg_list. + """ + assert len(cfg_list) % 2 == 0 + for key, value in zip(cfg_list[0::2], cfg_list[1::2]): + for g_cfg in g_cfgs: + if hasattr(g_cfg, key): + try: + value = eval(value) + except Exception: # for file path + pass + setattr(g_cfg, key, value) + break + + +# The placeholder for batch_size in compile time. Must be -1 currently to be +# consistent with some ops' infer-shape output in compile time, such as the +# sequence_expand op used in beamsearch decoder. +batch_size = -1 +# The placeholder for squence length in compile time. +seq_len = ModelHyperParams.max_length +# Here list the data shapes and data types of all inputs. +# The shapes here act as placeholder and are set to pass the infer-shape in +# compile time. 
+input_descs = { + # The actual data shape of src_word is: + # [batch_size * max_src_len_in_batch, 1] + "src_word": [(batch_size, seq_len, long_type(1)), "int64", 2], + # The actual data shape of src_pos is: + # [batch_size * max_src_len_in_batch, 1] + "src_pos": [(batch_size, seq_len, long_type(1)), "int64"], + # This input is used to remove attention weights on paddings in the + # encoder. + # The actual data shape of src_slf_attn_bias is: + # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch] + "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # The actual data shape of trg_word is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_word": [(batch_size, seq_len, long_type(1)), "int64", + 2], # lod_level is only used in fast decoder. + # The actual data shape of trg_pos is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_pos": [(batch_size, seq_len, long_type(1)), "int64"], + # This input is used to remove attention weights on paddings and + # subsequent words in the decoder. + # The actual data shape of trg_slf_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] + "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used to remove attention weights on paddings of the source + # input in the encoder-decoder attention. + # The actual data shape of trg_src_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] + "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used in independent decoder program for inference. + # The actual data shape of enc_output is: + # [batch_size, max_src_len_in_batch, d_model] + "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"], + # The actual data shape of label_word is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_word": [(batch_size * seq_len, long_type(1)), "int64"], + # This input is used to mask out the loss of paddding tokens. + # The actual data shape of label_weight is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"], + # These inputs are used to change the shape tensor in beam-search decoder. + "trg_slf_attn_pre_softmax_shape_delta": [(long_type(2), ), "int32"], + "trg_slf_attn_post_softmax_shape_delta": [(long_type(4), ), "int32"], + "init_score": [(batch_size, long_type(1)), "float32"], +} + +# Names of word embedding table which might be reused for weight sharing. +word_emb_param_names = ( + "src_word_emb_table", + "trg_word_emb_table", ) +# Names of position encoding table which will be initialized externally. +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) +# separated inputs for different usages. +encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) +decoder_data_input_fields = ( + "trg_word", + "trg_pos", + "trg_slf_attn_bias", + "trg_src_attn_bias", + "enc_output", ) +label_data_input_fields = ( + "lbl_word", + "lbl_weight", ) +# In fast decoder, trg_pos (only containing the current time step) is generated +# by ops and trg_slf_attn_bias is not needed. 
+fast_decoder_data_input_fields = ( + "trg_word", + "init_score", + "trg_src_attn_bias", ) + +# fast_decoder_util_input_fields = ( +# "trg_slf_attn_pre_softmax_shape_delta", +# "trg_slf_attn_post_softmax_shape_delta", ) + + +#from optim import LearningRateScheduler +class LearningRateScheduler(object): + """ + Wrapper for learning rate scheduling as described in the Transformer paper. + LearningRateScheduler adapts the learning rate externally and the adapted + learning rate will be feeded into the main_program as input data. + """ + + def __init__(self, + d_model, + warmup_steps, + learning_rate=0.001, + current_steps=0, + name="learning_rate"): + self.current_steps = current_steps + self.warmup_steps = warmup_steps + self.d_model = d_model + self.static_lr = learning_rate + self.learning_rate = layers.create_global_var( + name=name, + shape=[1], + value=float(learning_rate), + dtype="float32", + persistable=True) + + def update_learning_rate(self): + self.current_steps += 1 + lr_value = np.power(self.d_model, -0.5) * np.min([ + np.power(self.current_steps, -0.5), + np.power(self.warmup_steps, -1.5) * self.current_steps + ]) * self.static_lr + return np.array([lr_value], dtype="float32") + + +#from transformer_train import train_loop +def pad_batch_data(insts, + pad_idx, + n_head, + is_target=False, + is_label=False, + return_attn_bias=True, + return_max_len=True, + return_num_token=False): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) + num_token = six.moves.reduce( + lambda x, y: x + y, + [len(inst) for inst in insts]) if return_num_token else 0 + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if is_label: # label weight + inst_weight = np.array( + [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_weight.astype("float32").reshape([-1, 1])] + else: # position data + inst_pos = np.array([ + list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst)) + for inst in insts + ]) + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, + 1).reshape([-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + if return_num_token: + return_list += [num_token] + return return_list if len(return_list) > 1 else return_list[0] + + +def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, + n_head, d_model): + """ + Put all padded data needed by training into a dict. 
+ """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + src_word = src_word.reshape(-1, src_max_len, 1) + src_pos = src_pos.reshape(-1, src_max_len, 1) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_word = trg_word.reshape(-1, trg_max_len, 1) + trg_pos = trg_pos.reshape(-1, trg_max_len, 1) + + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + data_input_dict = dict( + list( + zip(data_input_names, [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ]))) + return data_input_dict, np.asarray([num_token], dtype="float32") + + +def read_multiple(reader, count, clip_last=True): + """ + Stack data from reader for multi-devices. + """ + + def __impl__(): + res = [] + for item in reader(): + res.append(item) + if len(res) == count: + yield res + res = [] + if len(res) == count: + yield res + elif not clip_last: + data = [] + for item in res: + data += item + if len(data) > count: + inst_num_per_part = len(data) // count + yield [ + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(count) + ] + + return __impl__ + + +def split_data(data, num_part): + """ + Split data for each device. + """ + if len(data) == num_part: + return data + data = data[0] + inst_num_per_part = len(data) // num_part + return [ + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(num_part) + ] + + +def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, + sum_cost, token_num): + # Context to do validation. 
+ test_program = train_progm.clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) + + val_data = DataReader( + src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, + trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, + fpattern=TrainTaskConfig.val_file_pattern, + token_delimiter=TrainTaskConfig.token_delimiter, + use_token_batch=TrainTaskConfig.use_token_batch, + batch_size=TrainTaskConfig.batch_size * + (1 if TrainTaskConfig.use_token_batch else dev_count), + pool_size=TrainTaskConfig.pool_size, + sort_type=TrainTaskConfig.sort_type, + start_mark=TrainTaskConfig.special_token[0], + end_mark=TrainTaskConfig.special_token[1], + unk_mark=TrainTaskConfig.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False, + shuffle=False, + shuffle_batch=False) + + build_strategy = fluid.BuildStrategy() + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy, + exec_strategy=strategy) + + def test(exe=test_exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple( + reader=val_data.batch_generator, + count=dev_count if TrainTaskConfig.use_token_batch else 1) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + feed_list.append(data_input_dict) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + return test + + +def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, + token_num, predict): + # Initialize the parameters. + if TrainTaskConfig.ckpt_path: + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + train_data = DataReader( + src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, + trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, + fpattern=TrainTaskConfig.train_file_pattern, + token_delimiter=TrainTaskConfig.token_delimiter, + use_token_batch=TrainTaskConfig.use_token_batch, + batch_size=TrainTaskConfig.batch_size * + (1 if TrainTaskConfig.use_token_batch else dev_count), + pool_size=TrainTaskConfig.pool_size, + sort_type=TrainTaskConfig.sort_type, + shuffle=TrainTaskConfig.shuffle, + shuffle_batch=TrainTaskConfig.shuffle_batch, + start_mark=TrainTaskConfig.special_token[0], + end_mark=TrainTaskConfig.special_token[1], + unk_mark=TrainTaskConfig.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False) + train_data = read_multiple( + reader=train_data.batch_generator, + count=dev_count if TrainTaskConfig.use_token_batch else 1) + + build_strategy = fluid.BuildStrategy() + # Since the token number differs among devices, customize gradient scale to + # use token average cost among multi-devices. 
and the gradient scale is + # `1 / token_number` for average cost. + build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name, + main_program=train_progm, + build_strategy=build_strategy, + exec_strategy=strategy) + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + + if TrainTaskConfig.val_file_pattern is not None: + test = test_context(train_progm, avg_cost, train_exe, dev_count, + data_input_names, sum_cost, token_num) + + # the best cross-entropy value with label smoothing + loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log( + (1. - TrainTaskConfig.label_smooth_eps + )) + TrainTaskConfig.label_smooth_eps * + np.log(TrainTaskConfig.label_smooth_eps / ( + ModelHyperParams.trg_vocab_size - 1) + 1e-20)) + init = False + for pass_id in six.moves.xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() + for batch_id, data in enumerate(train_data()): + if batch_id >= 5: + break + + feed_list = [] + total_num_token = 0 + + #if TrainTaskConfig.local: + # lr_rate = lr_scheduler.update_learning_rate() + #for place_id, data_buffer in enumerate( + # split_data( + # data, num_part=dev_count)): + + if TrainTaskConfig.local: + lr_rate = lr_scheduler.update_learning_rate() + + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + total_num_token += num_token + feed_kv_pairs = list(data_input_dict.items()) + if TrainTaskConfig.local: + feed_kv_pairs += list({ + lr_scheduler.learning_rate.name: lr_rate + }.items()) + feed_list.append(dict(feed_kv_pairs)) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + pos_enc = position_encoding_init( + ModelHyperParams.max_length + 1, + ModelHyperParams.d_model) + feed_list[place_id][pos_enc_param_name] = pos_enc + + if not TrainTaskConfig.check_acc: + for feed_dict in feed_list: + feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token + else: + b = 100 * TrainTaskConfig.batch_size + a = np.asarray([b], dtype="float32") + for feed_dict in feed_list: + feed_dict[sum_cost.name + "@GRAD"] = 1. / a + + outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name], + feed=feed_list) + + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + total_sum_cost = sum_cost_val.sum() + total_token_num = token_num_val.sum() + total_avg_cost = total_sum_cost / total_token_num + + init = True + + # Validate and save the model for inference. 
+ if TrainTaskConfig.val_file_pattern is not None: + val_avg_cost, val_ppl = test() + print("[%f]" % val_avg_cost) + else: + assert (False) + + +#import transformer_reader as reader +class SortType(object): + GLOBAL = 'global' + POOL = 'pool' + NONE = "none" + + +class Converter(object): + def __init__(self, vocab, beg, end, unk, delimiter): + self._vocab = vocab + self._beg = beg + self._end = end + self._unk = unk + self._delimiter = delimiter + + def __call__(self, sentence): + return [self._beg] + [ + self._vocab.get(w, self._unk) + for w in sentence.split(self._delimiter) + ] + [self._end] + + +class ComposedConverter(object): + def __init__(self, converters): + self._converters = converters + + def __call__(self, parallel_sentence): + return [ + self._converters[i](parallel_sentence[i]) + for i in range(len(self._converters)) + ] + + +class SentenceBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self._batch_size = batch_size + + def append(self, info): + self.batch.append(info) + if len(self.batch) == self._batch_size: + tmp = self.batch + self.batch = [] + return tmp + + +class TokenBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self.max_len = -1 + self._batch_size = batch_size + + def append(self, info): + cur_len = info.max_len + max_len = max(self.max_len, cur_len) + if max_len * (len(self.batch) + 1) > self._batch_size: + result = self.batch + self.batch = [info] + self.max_len = cur_len + return result + else: + self.max_len = max_len + self.batch.append(info) + + +class SampleInfo(object): + def __init__(self, i, max_len, min_len): + self.i = i + self.min_len = min_len + self.max_len = max_len + + +class MinMaxFilter(object): + def __init__(self, max_len, min_len, underlying_creator): + self._min_len = min_len + self._max_len = max_len + self._creator = underlying_creator + + def append(self, info): + if info.max_len > self._max_len or info.min_len < self._min_len: + return + else: + return self._creator.append(info) + + @property + def batch(self): + return self._creator.batch + + +class DataReader(object): + """ + The data reader loads all data from files and produces batches of data + in the way corresponding to settings. + + An example of returning a generator producing data batches whose data + is shuffled in each pass and sorted in each pool: + + ``` + train_data = DataReader( + src_vocab_fpath='data/src_vocab_file', + trg_vocab_fpath='data/trg_vocab_file', + fpattern='data/part-*', + use_token_batch=True, + batch_size=2000, + pool_size=10000, + sort_type=SortType.POOL, + shuffle=True, + shuffle_batch=True, + start_mark='', + end_mark='', + unk_mark='', + clip_last_batch=False).batch_generator + ``` + + :param src_vocab_fpath: The path of vocabulary file of source language. + :type src_vocab_fpath: basestring + :param trg_vocab_fpath: The path of vocabulary file of target language. + :type trg_vocab_fpath: basestring + :param fpattern: The pattern to match data files. + :type fpattern: basestring + :param batch_size: The number of sequences contained in a mini-batch. + or the maximum number of tokens (include paddings) contained in a + mini-batch. + :type batch_size: int + :param pool_size: The size of pool buffer. + :type pool_size: int + :param sort_type: The grain to sort by length: 'global' for all + instances; 'pool' for instances in pool; 'none' for no sort. + :type sort_type: basestring + :param clip_last_batch: Whether to clip the last uncompleted batch. 
+ :type clip_last_batch: bool + :param tar_fname: The data file in tar if fpattern matches a tar file. + :type tar_fname: basestring + :param min_length: The minimum length used to filt sequences. + :type min_length: int + :param max_length: The maximum length used to filt sequences. + :type max_length: int + :param shuffle: Whether to shuffle all instances. + :type shuffle: bool + :param shuffle_batch: Whether to shuffle the generated batches. + :type shuffle_batch: bool + :param use_token_batch: Whether to produce batch data according to + token number. + :type use_token_batch: bool + :param field_delimiter: The delimiter used to split source and target in + each line of data file. + :type field_delimiter: basestring + :param token_delimiter: The delimiter used to split tokens in source or + target sentences. + :type token_delimiter: basestring + :param start_mark: The token representing for the beginning of + sentences in dictionary. + :type start_mark: basestring + :param end_mark: The token representing for the end of sentences + in dictionary. + :type end_mark: basestring + :param unk_mark: The token representing for unknown word in dictionary. + :type unk_mark: basestring + :param seed: The seed for random. + :type seed: int + """ + + def __init__(self, + src_vocab_fpath, + trg_vocab_fpath, + fpattern, + batch_size, + pool_size, + sort_type=SortType.GLOBAL, + clip_last_batch=True, + tar_fname=None, + min_length=0, + max_length=100, + shuffle=True, + shuffle_batch=False, + use_token_batch=False, + field_delimiter="\t", + token_delimiter=" ", + start_mark="", + end_mark="", + unk_mark="", + seed=0): + self._src_vocab = self.load_dict(src_vocab_fpath) + self._only_src = True + if trg_vocab_fpath is not None: + self._trg_vocab = self.load_dict(trg_vocab_fpath) + self._only_src = False + self._pool_size = pool_size + self._batch_size = batch_size + self._use_token_batch = use_token_batch + self._sort_type = sort_type + self._clip_last_batch = clip_last_batch + self._shuffle = shuffle + self._shuffle_batch = shuffle_batch + self._min_length = min_length + self._max_length = max_length + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter + self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname, + unk_mark) + self._random = random.Random(x=seed) + + def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname, + unk_mark): + converters = [ + Converter( + vocab=self._src_vocab, + beg=self._src_vocab[start_mark], + end=self._src_vocab[end_mark], + unk=self._src_vocab[unk_mark], + delimiter=self._token_delimiter) + ] + if not self._only_src: + converters.append( + Converter( + vocab=self._trg_vocab, + beg=self._trg_vocab[start_mark], + end=self._trg_vocab[end_mark], + unk=self._trg_vocab[unk_mark], + delimiter=self._token_delimiter)) + + converters = ComposedConverter(converters) + + self._src_seq_ids = [] + self._trg_seq_ids = None if self._only_src else [] + self._sample_infos = [] + + for i, line in enumerate(self._load_lines(fpattern, tar_fname)): + src_trg_ids = converters(line) + self._src_seq_ids.append(src_trg_ids[0]) + lens = [len(src_trg_ids[0])] + if not self._only_src: + self._trg_seq_ids.append(src_trg_ids[1]) + lens.append(len(src_trg_ids[1])) + self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) + + def _load_lines(self, fpattern, tar_fname): + fpaths = glob.glob(fpattern) + + if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): + if tar_fname is None: + raise Exception("If tar file provided, please set tar_fname.") + 
+ f = tarfile.open(fpaths[0], "r") + for line in f.extractfile(tar_fname): + line = cpt.to_text(line) + fields = line.strip("\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + else: + for fpath in fpaths: + if not os.path.isfile(fpath): + raise IOError("Invalid file: %s" % fpath) + + with open(fpath, "rb") as f: + for line in f: + line = cpt.to_text(line) + fields = line.strip("\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + + @staticmethod + def load_dict(dict_path, reverse=False): + word_dict = {} + with open(dict_path, "rb") as fdict: + for idx, line in enumerate(fdict): + line = cpt.to_text(line) + if reverse: + word_dict[idx] = line.strip("\n") + else: + word_dict[line.strip("\n")] = idx + return word_dict + + def batch_generator(self): + # global sort or global shuffle + if self._sort_type == SortType.GLOBAL: + infos = sorted( + self._sample_infos, key=lambda x: x.max_len, reverse=True) + else: + if self._shuffle: + infos = self._sample_infos + self._random.shuffle(infos) + else: + infos = self._sample_infos + + if self._sort_type == SortType.POOL: + for i in range(0, len(infos), self._pool_size): + infos[i:i + self._pool_size] = sorted( + infos[i:i + self._pool_size], key=lambda x: x.max_len) + + # concat batch + batches = [] + batch_creator = TokenBatchCreator( + self._batch_size + ) if self._use_token_batch else SentenceBatchCreator(self._batch_size) + batch_creator = MinMaxFilter(self._max_length, self._min_length, + batch_creator) + + for info in infos: + batch = batch_creator.append(info) + if batch is not None: + batches.append(batch) + + if not self._clip_last_batch and len(batch_creator.batch) != 0: + batches.append(batch_creator.batch) + + if self._shuffle_batch: + self._random.shuffle(batches) + + for batch in batches: + batch_ids = [info.i for info in batch] + + if self._only_src: + yield [[self._src_seq_ids[idx]] for idx in batch_ids] + else: + yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1], + self._trg_seq_ids[idx][1:]) for idx in batch_ids] + + +#from transformer_model import transformer +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + position_enc = np.array([[ + pos / np.power(10000, 2 * (j // 2) / d_pos_vec) + for j in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. 
+ """ + q = layers.fc(input=queries, + size=d_key * n_head, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + k = layers.fc(input=keys, + size=d_key * n_head, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + v = layers.fc(input=values, + size=d_value * n_head, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + if n_head == 1: + return x + + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, 0, n_head, hidden_size // n_head]) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + return layers.reshape( + x=trans_x, + shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_model**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + if cache is not None: # use cache and concat time steps + k = cache["k"] = layers.concat([cache["k"], k], axis=1) + v = cache["v"] = layers.concat([cache["v"], v], axis=1) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. 
+ """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + act="relu", + param_attr=const_para_attr, + bias_attr=const_bias_attr) + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.initializer.Constant(1.), + bias_attr=fluid.initializer.Constant(0.)) + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def prepare_encoder(src_word, + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + word_emb_param_name=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. + """ + if TrainTaskConfig.check_acc: + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.ConstantInitializer(0.001))) + else: + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, + trainable=False, + initializer=fluid.initializer.ConstantInitializer(0.001))) + enc_input = src_word_emb + src_pos_enc + return layers.dropout( + enc_input, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) if dropout_rate else enc_input + + +prepare_encoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[0]) +prepare_decoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. 
+ """ + attn_output = multi_head_attention(enc_input, enc_input, enc_input, + attn_bias, d_key, d_value, d_model, + n_head, dropout_rate) + attn_output = post_process_layer(enc_input, attn_output, "dan", + dropout_rate) + ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) + return post_process_layer(attn_output, ffd_output, "dan", dropout_rate) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value, + d_model, d_inner_hid, dropout_rate) + enc_input = enc_output + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + cache=None): + """ The layer to be stacked in decoder part. + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. + """ + slf_attn_output = multi_head_attention( + dec_input, + dec_input, + dec_input, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, + cache, ) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + enc_attn_output = multi_head_attention( + slf_attn_output, + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, ) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + ffd_output = positionwise_feed_forward( + enc_attn_output, + d_inner_hid, + d_model, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + return dec_output + + +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + caches=None): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + cache = None + if caches is not None: + cache = caches[i] + + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + cache=cache) + dec_input = dec_output + return dec_output + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. + """ + inputs = [] + for input_field in input_fields: + input_var = layers.data( + name=input_field, + shape=input_descs[input_field][0], + dtype=input_descs[input_field][1], + lod_level=input_descs[input_field][2] + if len(input_descs[input_field]) == 3 else 0, + append_batch_size=False) + inputs.append(input_var) + return inputs + + +def transformer( + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + label_smooth_eps, ): + if weight_sharing: + assert src_vocab_size == src_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." 
+ ) + enc_inputs = make_all_inputs(encoder_data_input_fields) + + enc_output = wrap_encoder( + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + enc_inputs, ) + + dec_inputs = make_all_inputs(decoder_data_input_fields[:-1]) + + predict = wrap_decoder( + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + dec_inputs, + enc_output, ) + + # Padding index do not contribute to the total loss. The weights is used to + # cancel padding index in calculating the loss. + label, weights = make_all_inputs(label_data_input_fields) + if label_smooth_eps: + label = layers.label_smooth( + label=layers.one_hot( + input=label, depth=trg_vocab_size), + epsilon=label_smooth_eps) + + cost = layers.softmax_with_cross_entropy( + logits=layers.reshape( + predict, shape=[-1, trg_vocab_size]), + label=label, + soft_label=True if label_smooth_eps else False) + weighted_cost = cost * weights + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + avg_cost = sum_cost / token_num + avg_cost.stop_gradient = True + return sum_cost, avg_cost, predict, token_num + + +def wrap_encoder(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + enc_inputs=None): + """ + The wrapper assembles together all needed layers for the encoder. + """ + if enc_inputs is None: + # This is used to implement independent encoder program in inference. + src_word, src_pos, src_slf_attn_bias = \ + make_all_inputs(encoder_data_input_fields) + else: + src_word, src_pos, src_slf_attn_bias = \ + enc_inputs + enc_input = prepare_encoder( + src_word, + src_pos, + src_vocab_size, + d_model, + max_length, + dropout_rate, + word_emb_param_name=word_emb_param_names[0]) + enc_output = encoder(enc_input, src_slf_attn_bias, n_layer, n_head, d_key, + d_value, d_model, d_inner_hid, dropout_rate) + return enc_output + + +def wrap_decoder(trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + dec_inputs=None, + enc_output=None, + caches=None): + """ + The wrapper assembles together all needed layers for the decoder. + """ + if dec_inputs is None: + # This is used to implement independent decoder program in inference. + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + enc_output = make_all_inputs( + decoder_data_input_fields + decoder_util_input_fields) + else: + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs + + dec_input = prepare_decoder( + trg_word, + trg_pos, + trg_vocab_size, + d_model, + max_length, + dropout_rate, + word_emb_param_name=word_emb_param_names[0] + if weight_sharing else word_emb_param_names[1]) + dec_output = decoder( + dec_input, + enc_output, + trg_slf_attn_bias, + trg_src_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + caches=caches) + # Return logits for training and probs for inference. 
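[Editor's note] The loss built in `transformer()` above smooths the one-hot targets, takes a soft-label cross-entropy per position, masks padding with `weights`, and divides by the real token count. A NumPy sketch of the same bookkeeping on toy data; it uses the eps/(V-1) smoothing convention that the `loss_normalizer` earlier in this file assumes, which may differ slightly from `layers.label_smooth`:

```
import numpy as np

vocab, eps = 6, 0.1
logits = np.random.rand(5, vocab)                 # 5 flattened target positions
labels = np.array([1, 2, 3, 0, 0])                # last two positions are padding
weights = np.array([1., 1., 1., 0., 0.])          # mask out padding tokens

one_hot = np.eye(vocab)[labels]
smoothed = one_hot * (1. - eps) + (1. - one_hot) * eps / (vocab - 1)

log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
cost = -(smoothed * log_probs).sum(axis=1)        # soft-label cross-entropy

sum_cost = (cost * weights).sum()
token_num = weights.sum()
print(sum_cost / token_num)                        # avg cost per real token
```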
+ if weight_sharing: + predict = layers.matmul( + x=dec_output, + y=fluid.get_var(word_emb_param_names[0]), + transpose_y=True) + else: + predict = layers.fc(input=dec_output, + size=trg_vocab_size, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + if dec_inputs is None: + predict = layers.softmax(predict) + return predict + + +def fast_decode( + src_vocab_size, + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + beam_size, + max_out_len, + eos_idx, ): + """ + Use beam search to decode. Caches will be used to store states of history + steps which can make the decoding faster. + """ + enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head, + d_key, d_value, d_model, d_inner_hid, + dropout_rate, weight_sharing) + start_tokens, init_scores, trg_src_attn_bias = \ + make_all_inputs(fast_decoder_data_input_fields ) + + def beam_search(): + max_len = layers.fill_constant( + shape=[1], dtype=start_tokens.dtype, value=max_out_len) + step_idx = layers.fill_constant( + shape=[1], dtype=start_tokens.dtype, value=0) + cond = layers.less_than(x=step_idx, y=max_len) + while_op = layers.While(cond) + # array states will be stored for each step. + ids = layers.array_write( + layers.reshape(start_tokens, (-1, 1)), step_idx) + scores = layers.array_write(init_scores, step_idx) + # cell states will be overwrited at each step. + # caches contains states of history steps to reduce redundant + # computation in decoder. + caches = [{ + "k": layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, 0, d_model], + dtype=enc_output.dtype, + value=0), + "v": layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, 0, d_model], + dtype=enc_output.dtype, + value=0) + } for i in range(n_layer)] + with while_op.block(): + pre_ids = layers.array_read(array=ids, i=step_idx) + pre_ids = layers.reshape(pre_ids, (-1, 1, 1)) + pre_scores = layers.array_read(array=scores, i=step_idx) + # sequence_expand can gather sequences according to lod thus can be + # used in beam search to sift states corresponding to selected ids. + pre_src_attn_bias = layers.sequence_expand( + x=trg_src_attn_bias, y=pre_scores) + pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores) + pre_caches = [{ + "k": layers.sequence_expand( + x=cache["k"], y=pre_scores), + "v": layers.sequence_expand( + x=cache["v"], y=pre_scores), + } for cache in caches] + pre_pos = layers.elementwise_mul( + x=layers.fill_constant_batch_size_like( + input=pre_enc_output, # cann't use pre_ids here since it has lod + value=1, + shape=[-1, 1, 1], + dtype=pre_ids.dtype), + y=layers.increment( + x=step_idx, value=1.0, in_place=False), + axis=0) + logits = wrap_decoder( + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + weight_sharing, + dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), + enc_output=pre_enc_output, + caches=pre_caches) + logits = layers.reshape(logits, (-1, trg_vocab_size)) + + topk_scores, topk_indices = layers.topk( + input=layers.softmax(logits), k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(topk_scores), + y=layers.reshape( + pre_scores, shape=[-1]), + axis=0) + # beam_search op uses lod to distinguish branches. 
+ topk_indices = layers.lod_reset(topk_indices, pre_ids) + selected_ids, selected_scores = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=eos_idx) + + layers.increment(x=step_idx, value=1.0, in_place=True) + # update states + layers.array_write(selected_ids, i=step_idx, array=ids) + layers.array_write(selected_scores, i=step_idx, array=scores) + layers.assign(pre_src_attn_bias, trg_src_attn_bias) + layers.assign(pre_enc_output, enc_output) + for i in range(n_layer): + layers.assign(pre_caches[i]["k"], caches[i]["k"]) + layers.assign(pre_caches[i]["v"], caches[i]["v"]) + length_cond = layers.less_than(x=step_idx, y=max_len) + finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) + layers.logical_and(x=length_cond, y=finish_cond, out=cond) + + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=beam_size, end_id=eos_idx) + return finished_ids, finished_scores + + finished_ids, finished_scores = beam_search() + return finished_ids, finished_scores + + +def get_model(is_dist, is_async): + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + + local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + + if not is_dist: + optimizer = fluid.optimizer.Adam( + learning_rate=local_lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + elif is_async: + optimizer = fluid.optimizer.SGD(0.003) + optimizer.minimize(sum_cost) + else: + lr_decay = fluid.layers\ + .learning_rate_scheduler\ + .noam_decay(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps) + + optimizer = fluid.optimizer.Adam( + learning_rate=lr_decay, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + return sum_cost, avg_cost, predict, token_num, local_lr_scheduler + + +def update_args(): + src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath) + trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath) + dict_args = [ + "src_vocab_size", str(len(src_dict)), "trg_vocab_size", + str(len(trg_dict)), "bos_idx", + str(src_dict[TrainTaskConfig.special_token[0]]), "eos_idx", + str(src_dict[TrainTaskConfig.special_token[1]]), "unk_idx", + str(src_dict[TrainTaskConfig.special_token[2]]) + ] + merge_cfg_from_list(dict_args, [TrainTaskConfig, ModelHyperParams]) + + +class DistTransformer2x2(TestDistRunnerBase): + def run_pserver(self, args): + get_model(True, not args.sync_mode) + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), args.endpoints, + args.trainers, args.sync_mode) + pserver_prog = t.get_pserver_program(args.current_endpoint) + startup_prog = t.get_startup_program(args.current_endpoint, + pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def run_trainer(self, place, args): + + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( + args.is_dist, not 
args.sync_mode) + + if args.is_dist: + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), + args.endpoints, args.trainers, + args.sync_mode) + trainer_prog = t.get_trainer_program() + TrainTaskConfig.batch_size = 10 + TrainTaskConfig.train_file_pattern = TrainTaskConfig.data_path + "train.tok.clean.bpe.32000.en-de.train_{}".format( + args.trainer_id) + else: + TrainTaskConfig.batch_size = 20 + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + + TrainTaskConfig.local = not args.is_dist + + train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost, + local_lr_scheduler, token_num, predict) + + +if __name__ == "__main__": + update_args() + runtime_main(DistTransformer2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..f3e740fc7027a4a562b836c3113b87d55062c185 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -0,0 +1,125 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from test_dist_base import TestDistRunnerBase, runtime_main + +IS_SPARSE = True +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistWord2vec2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + BATCH_SIZE = batch_size + + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + hidden1 = fluid.layers.fc( + input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid', + param_attr=fluid.ParamAttr( + 
initializer=fluid.initializer.Constant(value=0.1))) + predict_word = fluid.layers.fc( + input=hidden1, + size=dict_size, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + cost = fluid.layers.cross_entropy( + input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(cost) + return avg_cost, predict_word + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data( + name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + avg_cost, predict_word = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + return inference_program, avg_cost, train_reader, test_reader, None, predict_word + + +if __name__ == "__main__": + runtime_main(TestDistWord2vec2x2) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e056ef9952a519d6c4d580b27f1118a3a91f13af..56a242b996f67aa4b9c858ab8aaeb1c1cd3bcf60 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import random +import six import time import itertools import collections @@ -32,7 +35,7 @@ def randomize_probability(batch_size, class_num, dtype='float32'): prob = np.random.uniform( 0.1, 1.0, size=(batch_size, class_num)).astype(dtype) prob_sum = prob.sum(axis=1) - for i in xrange(len(prob)): + for i in six.moves.xrange(len(prob)): prob[i] /= prob_sum[i] return prob @@ -44,28 +47,35 @@ def get_numeric_gradient(place, input_to_check, output_names, delta=0.005, - in_place=False): + in_place=False, + sum_outputs=None): # FIXME: change this method by compile time concepts set_input(scope, op, inputs, place) def product(dim): - return reduce(lambda a, b: a * b, dim, 1) + return six.moves.reduce(lambda a, b: a * b, dim, 1) def get_output(): sum = [] + op.run(scope, place) for output_name in output_names: - op.run(scope, place) + if sum_outputs and output_name not in sum_outputs: + continue sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) - return np.array(sum).mean() + return np.array(sum).sum() / len(output_names) tensor_to_check = scope.find_var(input_to_check).get_tensor() - tensor_size = product(tensor_to_check.get_dims()) - tensor_to_check_dtype = tensor_to_check.dtype() + tensor_size = product(tensor_to_check.shape()) + tensor_to_check_dtype = tensor_to_check._dtype() if tensor_to_check_dtype == core.VarDesc.VarType.FP32: tensor_to_check_dtype = np.float32 elif tensor_to_check_dtype == core.VarDesc.VarType.FP64: tensor_to_check_dtype = np.float64 + elif tensor_to_check_dtype == core.VarDesc.VarType.FP16: + tensor_to_check_dtype = np.float16 + # set delta as np.float16, will automatic convert to float32, float64 + delta = np.array(delta).astype(np.float16) else: raise ValueError("Not supported data type " + str( tensor_to_check_dtype)) @@ -73,20 +83,31 @@ def get_numeric_gradient(place, gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype) def __get_elem__(tensor, i): - if tensor_to_check_dtype == np.float32: - return tensor.get_float_element(i) + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + numpy_tensor = numpy_tensor.flatten() + return numpy_tensor[i] + elif tensor_to_check_dtype == np.float32: + return tensor._get_float_element(i) else: - return tensor.get_double_element(i) + return tensor._get_double_element(i) def __set_elem__(tensor, i, e): - if tensor_to_check_dtype == np.float32: - tensor.set_float_element(i, e) + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + shape = numpy_tensor.shape + numpy_tensor = numpy_tensor.flatten() + numpy_tensor[i] = e + numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16) + tensor.set(numpy_tensor, place) + elif tensor_to_check_dtype == np.float32: + tensor._set_float_element(i, e) else: - tensor.set_double_element(i, e) + tensor._set_double_element(i, e) # we only compute gradient of one element each time. # we use a for loop to compute the gradient of every element. 
- for i in xrange(tensor_size): + for i in six.moves.xrange(tensor_size): if in_place: set_input(scope, op, inputs, place) @@ -107,7 +128,7 @@ def get_numeric_gradient(place, __set_elem__(tensor_to_check, i, origin) gradient_flat[i] = (y_pos - y_neg) / delta / 2 - return gradient_flat.reshape(tensor_to_check.get_dims()) + return gradient_flat.reshape(tensor_to_check.shape()) class OpTest(unittest.TestCase): @@ -125,7 +146,7 @@ class OpTest(unittest.TestCase): @classmethod def tearDownClass(cls): - '''Restore random seeds''' + """Restore random seeds""" np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) @@ -133,13 +154,18 @@ class OpTest(unittest.TestCase): if not self.call_once: self.call_once = True self.dtype = data_type + # See the comment of np_dtype_to_fluid_dtype + # If the input type is uint16, we assume use float16 + # for lodtensor dtype. + if self.dtype == np.uint16: + self.dtype == np.float16 def infer_dtype_from_inputs_outputs(self, inputs, outputs): def infer_dtype(numpy_dict): assert isinstance( numpy_dict, dict), "self.inputs, self.outputs must be numpy_dict" - for var_name, var_value in numpy_dict.iteritems(): + for var_name, var_value in six.iteritems(numpy_dict): if isinstance(var_value, (np.ndarray, np.generic)): self.try_call_once(var_value.dtype) elif isinstance(var_value, (list, tuple)): @@ -161,19 +187,25 @@ class OpTest(unittest.TestCase): for name, np_value in self.inputs[var_name]: tensor = core.LoDTensor() if isinstance(np_value, tuple): - tensor.set(np_value[0], place) + tensor.set( + OpTest.np_value_to_fluid_value(np_value[0]), place) tensor.set_recursive_sequence_lengths(np_value[1]) else: - tensor.set(np_value, place) + tensor.set( + OpTest.np_value_to_fluid_value(np_value), place) feed_map[name] = tensor else: tensor = core.LoDTensor() if isinstance(self.inputs[var_name], tuple): - tensor.set(self.inputs[var_name][0], place) + tensor.set( + OpTest.np_value_to_fluid_value(self.inputs[var_name][ + 0]), place) tensor.set_recursive_sequence_lengths(self.inputs[var_name][ 1]) else: - tensor.set(self.inputs[var_name], place) + tensor.set( + OpTest.np_value_to_fluid_value(self.inputs[var_name]), + place) feed_map[var_name] = tensor return feed_map @@ -197,7 +229,7 @@ class OpTest(unittest.TestCase): def _get_io_vars(self, block, numpy_inputs): inputs = {} - for name, value in numpy_inputs.iteritems(): + for name, value in six.iteritems(numpy_inputs): if isinstance(value, list): var_list = [ block.var(sub_name) for sub_name, sub_value in value @@ -217,7 +249,7 @@ class OpTest(unittest.TestCase): outs, _ = self._calc_output(place) return outs - def _calc_output(self, place, parallel=False): + def _calc_output(self, place, parallel=False, no_check_set=None): program = Program() block = program.global_block() @@ -240,7 +272,9 @@ class OpTest(unittest.TestCase): # if the fetch_list is customized by user, we use it directly. # if not, fill the fetch_list by the user configured outputs in test. 
if len(fetch_list) == 0: - for var_name, var in outputs.iteritems(): + for var_name, var in six.iteritems(outputs): + if no_check_set is not None and var_name in no_check_set: + continue if isinstance(var, list): for v in var: fetch_list.append(v) @@ -251,19 +285,25 @@ class OpTest(unittest.TestCase): for out_name, out_dup in Operator.get_op_outputs(self.op_type): fetch_list.append(str(out_name)) # fetch_list = map(block.var, fetch_list) - if not isinstance(fetch_list[0], Variable): - fetch_list = map(block.var, fetch_list) + if not isinstance(fetch_list[0], fluid.framework.Variable): + fetch_list = list(map(block.var, fetch_list)) outs = executor.run(program, feed=feed_map, fetch_list=fetch_list, return_numpy=False) return outs, fetch_list - def check_output_with_place(self, place, atol): - outs, fetch_list = self._calc_output(place) + def check_output_with_place(self, + place, + atol, + no_check_set=None, + equal_nan=False): + outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: continue + if no_check_set is not None and out_name in no_check_set: + continue def find_actual(target_name, fetch_list): found = [ @@ -289,7 +329,7 @@ class OpTest(unittest.TestCase): if isinstance(expect, tuple) else expect self.assertTrue( np.allclose( - actual_t, expect_t, atol=atol), + actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + str(place)) if isinstance(expect, tuple): @@ -305,36 +345,46 @@ class OpTest(unittest.TestCase): expect_t = expect[0] if isinstance(expect, tuple) else expect self.assertTrue( np.allclose( - actual_t, expect_t, atol=atol), + actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + "\n" + str(expect_t)) + "\nExpect " + str(expect_t) + "\n" + "But Got" + + str(actual_t)) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) def _get_places(self): + if self.dtype == np.float16: + if core.is_compiled_with_cuda() and core.op_support_gpu( + self.op_type): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + return [place] + else: + return [] places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) return places - def check_output(self, atol=1e-5): + def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False): places = self._get_places() for place in places: - self.check_output_with_place(place, atol) + self.check_output_with_place(place, atol, no_check_set, equal_nan) def check_output_customized(self, checker): places = self._get_places() for place in places: outs = self.calc_output(place) outs = [np.array(out) for out in outs] + outs.sort(key=len) checker(outs) def __assert_is_close(self, numeric_grads, analytic_grads, names, max_relative_error, msg_prefix): - for a, b, name in itertools.izip(numeric_grads, analytic_grads, names): + for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names): abs_a = np.abs(a) abs_a[abs_a < 1e-3] = 1 @@ -344,9 +394,9 @@ class OpTest(unittest.TestCase): def err_msg(): offset = np.argmax(diff_mat > max_relative_error) return ("%s Variable %s max gradient diff %f over limit %f, " - "the first error element is %d, %f, %f") % ( - msg_prefix, name, max_diff, max_relative_error, - offset, a.flatten()[offset], 
b.flatten()[offset]) + "the first error element is %d, expected %f, but got %f" + ) % (msg_prefix, name, max_diff, max_relative_error, + offset, a.flatten()[offset], b.flatten()[offset]) self.assertLessEqual(max_diff, max_relative_error, err_msg()) @@ -357,13 +407,14 @@ class OpTest(unittest.TestCase): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None): + user_defined_grads=None, + sum_outputs=None): places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads) + user_defined_grads, sum_outputs) def check_grad_with_place(self, place, @@ -373,7 +424,8 @@ class OpTest(unittest.TestCase): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None): + user_defined_grads=None, + sum_outputs=None): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() @@ -396,7 +448,8 @@ class OpTest(unittest.TestCase): input_to_check, output_names, delta=numeric_grad_delta, - in_place=in_place) for input_to_check in inputs_to_check + in_place=in_place, + sum_outputs=sum_outputs) for input_to_check in inputs_to_check ] analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set) @@ -435,6 +488,21 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input + @staticmethod + def fluid_dtype_to_np_dtype(self, dtype): + """ + See above, convert the dtype to normal type. + """ + if dtype == np.uint16: + dtype = np.float16 + return dtype + + @staticmethod + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def _get_gradient(self, input_to_check, place, @@ -457,9 +525,9 @@ class OpTest(unittest.TestCase): if isinstance(place, fluid.CUDAPlace(0)): use_cuda = True executor = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=program) + use_cuda=use_cuda, loss_name=loss.name, main_program=prog) else: executor = Executor(place) - return map(np.array, - executor.run(prog, feed_dict, fetch_list, - return_numpy=False)) + return list( + map(np.array, + executor.run(prog, feed_dict, fetch_list, return_numpy=False))) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index cddf00765f4894126988c794763c34629449e8e6..74e9d5c5f91e53a315c85d428571ce45bacede8a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import multiprocessing import os import unittest @@ -35,7 +37,9 @@ class TestParallelExecutorBase(unittest.TestCase): feed_dict=None, seed=None, use_parallel_executor=True, - balance_parameter_opt_between_cards=False): + use_reduce=False, + optimizer=fluid.optimizer.Adam, + use_fast_executor=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -50,22 +54,30 @@ class TestParallelExecutorBase(unittest.TestCase): main = fluid.Program() startup = fluid.Program() startup.random_seed = 1 # Fix random seed + main.random_seed = 1 with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed + main.random_seed = seed + loss = method(use_feed=feed_dict is not None) - adam = fluid.optimizer.Adam() - adam.minimize(loss) + + optimizer().minimize(loss) + if memory_opt: fluid.memory_optimize(main) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay + if use_fast_executor: + exec_strategy.use_experimental_executor = True build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_parallel_executor: exe = fluid.ParallelExecutor( @@ -84,7 +96,7 @@ class TestParallelExecutorBase(unittest.TestCase): first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) - for i in xrange(iter): + for i in range(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) last_loss, = run_executor( @@ -92,8 +104,8 @@ class TestParallelExecutorBase(unittest.TestCase): end = time.time() if batch_size is not None: - print "%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin)) + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) avg_last_loss_val = np.array(last_loss).mean() avg_first_loss_val = np.array(first_loss).mean() @@ -101,6 +113,6 @@ class TestParallelExecutorBase(unittest.TestCase): float(avg_first_loss_val)): sys.exit("got NaN loss, training failed.") - print first_loss, last_loss + print(first_loss, last_loss) # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 212a87e529da83c40ba8852e81bdf43d4611897b..1b2b53f2d4ce91ae7b5b191ed770b5338f0948c8 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
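(Illustrative aside, not part of the patch.) The reworked TestParallelExecutorBase above switches between the AllReduce and Reduce gradient-aggregation strategies and can enable the experimental executor; a minimal sketch of that configuration pattern, assuming the fluid API as used in the patch:

import paddle.fluid as fluid

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.allow_op_delay = False
exec_strategy.use_experimental_executor = True   # what use_fast_executor=True turns on

build_strategy = fluid.BuildStrategy()
# use_reduce=True selects Reduce; otherwise the default AllReduce is kept
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

# both objects are then handed to fluid.ParallelExecutor(...) as in the test base above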
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -26,7 +28,7 @@ class TestAccuracyOp(OpTest): label = np.random.randint(0, 2, (n, 1)) self.inputs = {'Out': infer, 'Indices': indices, "Label": label} num_correct = 0 - for rowid in xrange(n): + for rowid in range(n): for ele in indices[rowid]: if ele == label[rowid]: num_correct += 1 diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py index 7d554c2276c9acd710d14c8f8b32c802e3e17515..611d0dd076b827b0f528f2e3a31182cc4939d1f1 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 5ed387fb1247f1a91147cb6981f1adc7c2eeb8a2..30651c1326328180592520447e597aa722146a42 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core @@ -313,9 +315,9 @@ class TestAbs(OpTest): self.init_dtype() x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) - # Because we set delta = 0.005 in caculating numeric gradient, + # Because we set delta = 0.005 in calculating numeric gradient, # if x is too small, such as 0.002, x_neg will be -0.003 - # x_pos will be 0.007, so the numeric gradient is unaccurate. + # x_pos will be 0.007, so the numeric gradient is inaccurate. # we should avoid this x[np.abs(x) < 0.005] = 0.02 out = np.abs(x) diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 1b892e64c7654a1a3905672813452650885790a5..969a7da3b71b69296f3313342adbf989c60edb50 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py index 2f0ea79f4d6afe91ee7e0d747f3d8f4884d8f9ee..fc3b7ce2fd87afc22030bcca55236fb949c1f129 100644 --- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py +++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 3c65f3d44adcebdca92f78f7834d4878a9fa3dfe..5318d2f9766ce671925be614feef57d679270b19 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -273,7 +275,7 @@ class TestSparseAdamOp(unittest.TestCase): self.setup(scope, place) op_args = dict() - for key, np_array in self.dense_inputs.iteritems(): + for key, np_array in self.dense_inputs.items(): var = scope.var(key).get_tensor() var.set(np_array, place) op_args[key] = key @@ -290,7 +292,7 @@ class TestSparseAdamOp(unittest.TestCase): adam_op = Operator("adam", **op_args) adam_op.run(scope, place) - for key, np_array in self.outputs.iteritems(): + for key, np_array in self.outputs.items(): out_var = scope.var(key).get_tensor() actual = np.array(out_var) actual = actual.reshape([actual.size]) diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py index 8099beefa583d152715334e83f0c6e8e4a3e7a0d..a6d1be7616c73019cd8f66dcf0c108cd58ec600b 100644 --- a/python/paddle/fluid/tests/unittests/test_adamax_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py index 9c7d5d41f0c512a9fb609dce304c1eed929d28b5..d31eaa0114c3b035add3e6ca792696b5cafb9690 100644 --- a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py +++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py index e04412f809cdd75d07d28a60f0c2f19041a684f6..0712e102b30fc72c7f8b62eb9230e7f4ab615ef0 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py index b29a102a3880406156481fdac54ca7043d3415db..7bc6f2599d617b192908da9b57d0cd715019bd71 100644 --- a/python/paddle/fluid/tests/unittests/test_argsort_op.py +++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py index a49e9035a43e04fc1d1b2328d7562c053320b24b..b86d0bc43a9f84988f2b1b27f7aeffce46a46bd9 100644 --- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py +++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -80,8 +82,9 @@ class TestArrayReadWrite(unittest.TestCase): append_backward(total_sum_scaled) - g_vars = map(default_main_program().global_block().var, - [each_x.name + "@GRAD" for each_x in x]) + g_vars = list( + map(default_main_program().global_block().var, + [each_x.name + "@GRAD" for each_x in x])) g_out = [ item.sum() for item in exe.run( diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index e93c02bd3ee9f710cbb9bff4e195dfc3caabe422..ba2eecfaf197ea63c187e77ae7ae8cf34873d66b 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import op_test import numpy import unittest diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py index 02f2e6eddc80fcce4ca5a444cff82db355c085ca..5a9d8efef1f3e5a9e116720c2ffe32c2ef0a082f 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid import paddle.fluid.layers as layers import op_test diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9c3efe0fa9e9f1b8ad09029079898622e7d489 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -0,0 +1,208 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION +from test_softmax_op import stable_softmax + + +def attention_lstm( + x, # T x M + lod, # 1 x N + h0, # N x D + c0, # N x D + fcws, # (M+D) x 1, 1x1 + fcbs, # 1 x 1, 1x1 + w, # (M+D) x 4D + b, # 1 x 4D + act_gate, + act_cell, + act_cand): + + T = sum(lod[0]) + N = len(lod[0]) + M = x.shape[1] + D = b.shape[1] // 4 + assert T == x.shape[0] + assert len(fcws) == len(fcbs) + hidden = [] + cell = [] + + start_offset = 0 + for bid in range(N): + seq_len = lod[0][bid] + xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len, + M) + prev_cell = np.copy(c0[bid]).reshape([1, D]) + prev_hidden = np.copy(h0[bid]).reshape([1, D]) + for step in range(seq_len): + expanded_cell = np.repeat(prev_cell, seq_len, axis=0) + tmp = np.concatenate((xi, expanded_cell), axis=1) + assert tmp.shape[0] == seq_len + assert tmp.shape[1] == M + D + for fcid in range(len(fcbs)): + tmp = fc(tmp, fcws[fcid], fcbs[fcid]) + tmp = ACTIVATION['relu'](tmp) + tmp = np.reshape(tmp, (1, seq_len)) + tmp = stable_softmax(tmp).reshape(seq_len, 1) + lstmx = xi * tmp # seq * M + lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M]) + lstmin = np.concatenate((prev_hidden, lstmx), axis=1) + lstmout = fc(lstmin, w, b).reshape([1, 4 * D]) + + g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1) + g_f = act_gate(g_f).reshape([1, D]) + g_i = act_gate(g_i).reshape([1, D]) + g_o = act_gate(g_o).reshape([1, D]) + cand = act_cand(cand).reshape([1, D]) + + cell_t = (prev_cell * g_f) + (g_i * cand) + hidden_t = g_o * act_cell(cell_t) + + hidden.append(hidden_t.flatten()) + cell.append(cell_t.flatten()) + + prev_cell = cell_t.reshape([1, D]) + prev_hidden = hidden_t.reshape([1, D]) + + start_offset += seq_len + + hidden = np.array(hidden).astype('float32').reshape([T, D]) + cell = np.array(cell).astype('float32').reshape([T, D]) + return hidden, cell + + +class TestAttentionLSTMOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'attention_lstm' + self.lod = [[3]] + self.M = 30 + self.D = 15 + self.has_initial_hidden = True + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + x = np.random.normal(size=(T, self.M)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') + if self.has_initial_hidden: + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + else: + h0 = np.zeros((bs, self.D)).astype('float32') + + fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32') + fcb1 = np.random.normal(size=(1, 1)).astype('float32') + fcw2 = np.random.normal(size=(1, 1)).astype('float32') + fcb2 = np.random.normal(size=(1, 1)).astype('float32') + + # lstm weight and bias + w = np.random.normal(size=(self.M + self.D, + self.D * 4)).astype('float32') + b = np.random.normal(size=(1, self.D * 4)).astype('float32') + + h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2], + w, b, ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x, self.lod), + 'C0': c0, + 'AttentionWeight': fcw1, + 'AttentionBias': fcb1, + 'AttentionScalar': fcw2, + 'AttentionScalarBias': fcb2, + 'LSTMWeight': w, + 'LSTMBias': b + } + + if self.has_initial_hidden: + self.inputs['H0'] = h0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + 
self.attrs = { + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + self.check_output() + + +class TestAttentionOpNonInit(TestAttentionLSTMOp): + def set_conf(self): + self.has_initial_hidden = False + + +class TestAttentionOpAct(TestAttentionLSTMOp): + def set_conf(self): + self.M = 3 + self.D = 2 + self.act_gate = 'relu' + self.act_cell = 'tanh' + self.act_cand = 'sigmoid' + + +class TestAttentionOpMD1(TestAttentionLSTMOp): + def set_conf(self): + self.M = 36 + self.D = 8 + + +class TestAttentionOpMD2(TestAttentionLSTMOp): + def set_conf(self): + self.M = 8 + self.D = 8 + + +class TestAttentionOpMD3(TestAttentionLSTMOp): + def set_conf(self): + self.M = 15 + self.D = 30 + + +class TestAttentionOpBS1(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[5]] + self.M = 16 + self.D = 32 + + +class TestAttentionOpBS2(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 6]] + + +class TestAttentionOpBS5(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 2, 4, 7, 5]] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 948836039be48ad74d5556100f06231bb89f26d3..1de4a9d016a177944253d12094722d3a05614be2 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -12,66 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest +from paddle.fluid import metrics class TestAucOp(OpTest): def setUp(self): self.op_type = "auc" pred = np.random.random((128, 2)).astype("float32") - indices = np.random.randint(0, 2, (128, 2)) labels = np.random.randint(0, 2, (128, 1)) num_thresholds = 200 - self.inputs = {'Out': pred, 'Indices': indices, 'Label': labels} - self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} - # NOTE: sklearn use a different way to generate thresholds - # which will cause the result differs slightly: - # from sklearn.metrics import roc_curve, auc - # fpr, tpr, thresholds = roc_curve(labels, pred) - # auc_value = auc(fpr, tpr) - # we caculate AUC again using numpy for testing - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds - 2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - # caculate TP, FN, TN, FP count - tp_list = np.ndarray((num_thresholds, )) - fn_list = np.ndarray((num_thresholds, )) - tn_list = np.ndarray((num_thresholds, )) - fp_list = np.ndarray((num_thresholds, )) - for idx_thresh, thresh in enumerate(thresholds): - tp, fn, tn, fp = 0, 0, 0, 0 - for i, lbl in enumerate(labels): - if lbl: - if pred[i, 0] >= thresh: - tp += 1 - else: - fn += 1 - else: - if pred[i, 0] >= thresh: - fp += 1 - else: - tn += 1 - tp_list[idx_thresh] = tp - fn_list[idx_thresh] = fn - tn_list[idx_thresh] = tn - fp_list[idx_thresh] = fp + stat_pos = np.zeros((num_thresholds + 1, )).astype("int64") + stat_neg = np.zeros((num_thresholds + 1, )).astype("int64") - epsilon = 1e-6 - tpr = (tp_list.astype("float32") + epsilon) / ( - tp_list + fn_list + epsilon) - fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon) - rec = (tp_list.astype("float32") + epsilon) / ( - tp_list + fp_list + epsilon) - - x = 
fpr[:num_thresholds - 1] - fpr[1:] - y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 - auc_value = np.sum(x * y) + self.inputs = { + 'Predict': pred, + 'Label': labels, + "StatPos": stat_pos, + "StatNeg": stat_neg + } + self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} - self.outputs = {'AUC': auc_value} + python_auc = metrics.Auc(name="auc", + curve='ROC', + num_thresholds=num_thresholds) + python_auc.update(pred, labels) + + self.outputs = { + 'AUC': np.array(python_auc.eval()), + 'BatchAUC': np.array(python_auc.eval()), + 'StatPosOut': np.array(python_auc._stat_pos), + 'StatNegOut': np.array(python_auc._stat_neg) + } def test_check_output(self): self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py index 18fa5461590134d2032a29e40699109c12092c6d..1286cee8dc1855c1b1695da46ae0b5222c065114 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index a62ee9596d0f6c58135b4a13249b638e84e63c3c..80261eff4e747f87658bc7c9114c21bee511df09 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core @@ -129,7 +131,6 @@ def create_or_get_tensor(scope, var_name, var, place): if var is not None: assert isinstance(var, np.ndarray) tensor.set_recursive_sequence_lengths([]) - tensor.set_dims(var.shape) tensor.set(var, place) return tensor @@ -416,7 +417,7 @@ class TestBatchNormOpTraining(unittest.TestCase): self.__assert_close(scale_grad, out[6], "scale_grad") self.__assert_close(bias_grad, out[7], "bias_grad") - print "op test forward passed: ", str(place), data_layout + print("op test forward passed: ", str(place), data_layout) places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index db5771f7b0ad74c73b81d502209c17dce3ce8457..51eee41ab2d4d1113426991c63bee949cca15ad4 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
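(Illustrative aside, not part of the patch.) The rewritten TestAucOp above delegates the reference value to paddle.fluid.metrics.Auc instead of the deleted hand-rolled computation; for context, a compact NumPy sketch of the trapezoidal ROC-AUC that the deleted code computed (illustrative only, the operator's exact thresholding may differ):

import numpy as np

def roc_auc(pred_pos, labels, num_thresholds=200):
    # pred_pos: P(label == 1) per sample, shape (N,); labels: 0/1 ints, shape (N,)
    eps = 1e-7
    thresholds = [0.0 - eps] + [(i + 1) * 1.0 / (num_thresholds - 1)
                                for i in range(num_thresholds - 2)] + [1.0 + eps]
    tpr, fpr = [], []
    for t in thresholds:
        predicted_pos = pred_pos >= t
        tp = np.sum(predicted_pos & (labels == 1))
        fp = np.sum(predicted_pos & (labels == 0))
        fn = np.sum(~predicted_pos & (labels == 1))
        tn = np.sum(~predicted_pos & (labels == 0))
        tpr.append((tp + eps) / (tp + fn + eps))
        fpr.append(fp / (fp + tn + eps))
    tpr, fpr = np.array(tpr), np.array(fpr)
    # trapezoidal rule over the ROC curve (thresholds ascend, so fpr and tpr descend)
    x = fpr[:-1] - fpr[1:]
    y = (tpr[:-1] + tpr[1:]) / 2.0
    return float(np.sum(x * y))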
+from __future__ import print_function + import unittest import numpy as np @@ -100,6 +102,8 @@ class TestBeamSearchDecodeOp(unittest.TestCase): np.array_equal(np.array(sentence_scores), expected_data)) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp): def setUp(self): self.scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index 167451edd8c46c006c8019678a304a38f18cb946..c28dda4b53ce5d394ff11222e5df8d257b4e80da 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import logging from paddle.fluid.op import Operator, DynamicRecurrentOp import paddle.fluid.core as core @@ -59,8 +61,7 @@ class BeamSearchOpTester(unittest.TestCase): np.allclose( np.array(selected_scores), np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) - self.assertEqual(selected_ids.lod(), - [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) + self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index b04f25ef874cc6204211a4f5f5991a0ec8c473dd..bed847c3c168c906a89c32631b2a8f0ba2e6e7be 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py index d20a11e27eaac12534ea33c398247adb8db01d4b..46831119c5fee938780ec8fdb9d0cdb3b63a473d 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index d5bd726c4a82ee839703c69a933100bb056cb736..5cc8e2ba15d260b988ee66a5711aed42ca04c10b 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -11,6 +11,8 @@ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #See the License for the specific language governing permissions and #limitations under the License. 
+ +from __future__ import print_function import unittest import numpy as np from op_test import OpTest @@ -48,7 +50,7 @@ def bipartite_match(distance, match_indices, match_dist): def argmax_match(distance, match_indices, match_dist, threshold): r, c = distance.shape - for j in xrange(c): + for j in range(c): if match_indices[j] != -1: continue col_dist = distance[:, j] diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 4ce9a4783e2332b6882164a70e1462c6a6d31bef..2511c5c22e012babdeb71a71d3546456ea2ceaf3 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 06e676cd83e77549afd679e730426c590cc046bf..4120a18b72f87c7e750a0fb68780292b58e3a7f4 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.layers as layers -import paddle.fluid.framework as framework -import paddle.fluid.optimizer as optimizer from paddle.fluid.backward import calc_gradient diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index b8d3ed3aa3eb0e47e79f46cdf681a3b9cca46036..71a2ccb6da47588d84c263105560626435ac461a 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import op_test import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py deleted file mode 100644 index e22400a045ced16c46b0bf005155f621f249d263..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_checkpoint.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle.fluid as fluid -import unittest -import os -import tempfile - - -class TestCheckpoint(unittest.TestCase): - def setUp(self): - self.dirname = tempfile.mktemp() - self.max_num_checkpoints = 3 - self.epoch_interval = 1 - self.step_interval = 1 - self.trainer_id = 0 - self.chief = self.trainer_id == 0 - self.place = fluid.CPUPlace() - self.epoch_id = 100 - self.step_id = 20 - - def test_checkpoint(self): - self.save_checkpoint() - serial = fluid.io.get_latest_checkpoint_serial(self.dirname) - self.assertTrue(serial >= 0) - trainer_args = ["epoch_id", "step_id"] - epoch_id, step_id = fluid.io.load_trainer_args( - self.dirname, serial, self.trainer_id, trainer_args) - self.assertEqual(self.step_id, int(step_id)) - self.assertEqual(self.epoch_id, int(epoch_id)) - - program = fluid.Program() - with fluid.program_guard(program): - exe = fluid.Executor(self.place) - fluid.io.load_checkpoint(exe, self.dirname, serial, program) - - fluid.io.clean_checkpoint(self.dirname, delete_dir=True) - self.assertFalse(os.path.isdir(self.dirname)) - - def save_checkpoint(self): - config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints, - self.epoch_interval, self.step_interval) - - trainer_args = {} - trainer_args["epoch_id"] = self.epoch_id - trainer_args["step_id"] = self.step_id - - program = fluid.Program() - with fluid.program_guard(program): - program.global_block().create_var( - name="scale_0", - psersistable=True, - dtype="float32", - shape=[32, 32]) - - exe = fluid.Executor(self.place) - for i in xrange(10): - fluid.io.save_checkpoint(exe, config.checkpoint_dir, - self.trainer_id, trainer_args, program, - config.max_num_checkpoints) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py index 23932194f0ca97954ec9ade3fdcaebd7a32749a0..48eb8e9f7585d41d541ac3645e9a50dc79058de7 100644 --- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py +++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -63,7 +65,7 @@ class TestChunkEvalOp(OpTest): # generate chunk beginnings chunk_begins = sorted( np.random.choice( - range(starts[-1]), num_chunks, replace=False)) + list(range(starts[-1])), num_chunks, replace=False)) seq_chunk_begins = [] begin_idx = 0 # divide chunks into sequences @@ -93,7 +95,7 @@ class TestChunkEvalOp(OpTest): self.num_infer_chunks + self.num_label_chunks - self.num_correct_chunks) correct_chunks = np.random.choice( - range(len(chunks)), self.num_correct_chunks, replace=False) + list(range(len(chunks))), self.num_correct_chunks, replace=False) infer_chunks = np.random.choice( [x for x in range(len(chunks)) if x not in correct_chunks], self.num_infer_chunks - self.num_correct_chunks, @@ -138,7 +140,8 @@ class TestChunkEvalOp(OpTest): infer.fill(self.num_chunk_types * self.num_tag_types) label = np.copy(infer) starts = np.random.choice( - range(1, self.batch_size), self.num_sequences - 1, + list(range(1, self.batch_size)), + self.num_sequences - 1, replace=False).tolist() starts.extend([0, self.batch_size]) starts = sorted(starts) diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 129958fa2818418dccce91683a9424e6324c6ac2..6103c3aafc0bb154194314830c5c8c5d89460cfe 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 3df80c8ec8fa39c0aaa2b8726fe3b37aef488442..32677bdb4c897b4e20f8fb166b080ac6e6a221b7 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 405afebae85eaae6f6af0012058ad58c8bb69a2f..437ad35538a5fa380f950fd3b71e334276214ec7 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import op_test import unittest import numpy diff --git a/python/paddle/fluid/tests/unittests/test_compat.py b/python/paddle/fluid/tests/unittests/test_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2c46f99a82875b917a330d6ec76062222420de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_compat.py @@ -0,0 +1,505 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.compat as cpt +import six + + +class TestCompatible(unittest.TestCase): + def test_type(self): + if six.PY2: + self.assertEqual(cpt.int_type, int) + self.assertEqual(cpt.long_type, long) + else: + self.assertEqual(cpt.int_type, int) + self.assertEqual(cpt.long_type, int) + + def test_to_text(self): + # Only support python2.x and python3.x now + self.assertTrue(six.PY2 | six.PY3) + + if six.PY2: + # check None + self.assertIsNone(cpt.to_text(None)) + + # check all string related types + self.assertTrue(isinstance(cpt.to_text(str("")), unicode)) + self.assertTrue(isinstance(cpt.to_text(str("123")), unicode)) + self.assertTrue(isinstance(cpt.to_text(b""), unicode)) + self.assertTrue(isinstance(cpt.to_text(b""), unicode)) + self.assertTrue(isinstance(cpt.to_text(u""), unicode)) + self.assertTrue(isinstance(cpt.to_text(u""), unicode)) + + self.assertEqual(u"", cpt.to_text(str(""))) + self.assertEqual(u"123", cpt.to_text(str("123"))) + self.assertEqual(u"", cpt.to_text(b"")) + self.assertEqual(u"123", cpt.to_text(b"123")) + self.assertEqual(u"", cpt.to_text(u"")) + self.assertEqual(u"123", cpt.to_text(u"123")) + + # check list types, not inplace + l = [""] + l2 = cpt.to_text(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([u""], l2) + l = ["", "123"] + l2 = cpt.to_text(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([u"", u"123"], l2) + l = ["", b'123', u"321"] + l2 = cpt.to_text(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([u"", u"123", u"321"], l2) + for i in l2: + self.assertTrue(isinstance(i, unicode)) + + # check list types, inplace + l = [""] + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([u""], l2) + l = ["", "123"] + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([u"", u"123"], l2) + l = ["", b"123", u"321"] + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([u"", u"123", u"321"], l2) + + # check set types, not inplace + l = set("") + l2 = cpt.to_text(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(u""), l2) + l = set([b"", b"123"]) + l2 = cpt.to_text(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([u"", u"123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_text(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([u"", u"123", u"321"]), l2) + for i in l2: + self.assertTrue(isinstance(i, unicode)) + + # check set types, inplace + l = set("") + l2 = 
cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(u""), l2) + l = set([b"", b"123"]) + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([u"", u"123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([u"", u"123", u"321"]), l2) + + elif six.PY3: + self.assertIsNone(cpt.to_text(None)) + + self.assertTrue(isinstance(cpt.to_text(str("")), str)) + self.assertTrue(isinstance(cpt.to_text(str("123")), str)) + self.assertTrue(isinstance(cpt.to_text(b""), str)) + self.assertTrue(isinstance(cpt.to_text(b""), str)) + self.assertTrue(isinstance(cpt.to_text(u""), str)) + self.assertTrue(isinstance(cpt.to_text(u""), str)) + + self.assertEqual("", cpt.to_text(str(""))) + self.assertEqual("123", cpt.to_text(str("123"))) + self.assertEqual("", cpt.to_text(b"")) + self.assertEqual("123", cpt.to_text(b"123")) + self.assertEqual("", cpt.to_text(u"")) + self.assertEqual("123", cpt.to_text(u"123")) + + # check list types, not inplace + l = [""] + l2 = cpt.to_text(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([""], l2) + l = ["", "123"] + l2 = cpt.to_text(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(["", "123"], l2) + l = ["", b"123", u"321"] + l2 = cpt.to_text(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual(["", "123", "321"], l2) + + # check list types, inplace + l = [""] + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([""], l2) + l = ["", b"123"] + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(["", "123"], l2) + l = ["", b"123", u"321"] + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(["", "123", "321"], l2) + for i in l2: + self.assertTrue(isinstance(i, str)) + + # check set types, not inplace + l = set("") + l2 = cpt.to_text(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(""), l2) + l = set([b"", b"123"]) + l2 = cpt.to_text(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual(set(["", "123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_text(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual(set(["", "123", "321"]), l2) + + # check set types, inplace + l = set("") + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(""), l2) + l = set([b"", b"123"]) + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(["", "123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_text(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + 
self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(["", "123", "321"]), l2) + for i in l2: + self.assertTrue(isinstance(i, str)) + + def test_to_bytes(self): + # Only support python2.x and python3.x now + self.assertTrue(six.PY2 | six.PY3) + + if six.PY2: + # check None + self.assertIsNone(cpt.to_bytes(None)) + + # check all string related types + self.assertTrue(isinstance(cpt.to_bytes(str("")), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(str("123")), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(b""), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(b""), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(u""), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(u""), bytes)) + + self.assertEqual(b"", cpt.to_bytes(str(""))) + self.assertEqual(b"123", cpt.to_bytes(str("123"))) + self.assertEqual(b"", cpt.to_bytes(b"")) + self.assertEqual(b"123", cpt.to_bytes(b"123")) + self.assertEqual(b"", cpt.to_bytes(u"")) + self.assertEqual(b"123", cpt.to_bytes(u"123")) + + # check list types, not inplace + l = [""] + l2 = cpt.to_bytes(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b""], l2) + l = ["", "123"] + l2 = cpt.to_bytes(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b"", b"123"], l2) + l = ["", b'123', u"321"] + l2 = cpt.to_bytes(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b"", b"123", b"321"], l2) + for i in l2: + self.assertTrue(isinstance(i, bytes)) + + # check list types, inplace + l = [""] + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b""], l2) + l = ["", "123"] + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b"", b"123"], l2) + l = ["", b"123", u"321"] + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b"", b"123", b"321"], l2) + + # check set types, not inplace + l = set("") + l2 = cpt.to_bytes(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(b""), l2) + l = set([b"", b"123"]) + l2 = cpt.to_bytes(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([b"", b"123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_bytes(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([b"", b"123", b"321"]), l2) + for i in l2: + self.assertTrue(isinstance(i, bytes)) + + # check set types, inplace + l = set("") + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(b""), l2) + l = set([b"", b"123"]) + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([b"", b"123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([b"", b"123", b"321"]), l2) + + elif six.PY3: + 
self.assertIsNone(cpt.to_bytes(None)) + + self.assertTrue(isinstance(cpt.to_bytes(str("")), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(str("123")), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(b""), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(b""), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(u""), bytes)) + self.assertTrue(isinstance(cpt.to_bytes(u""), bytes)) + + self.assertEqual(b"", cpt.to_bytes(str(""))) + self.assertEqual(b"123", cpt.to_bytes(str("123"))) + self.assertEqual(b"", cpt.to_bytes(b"")) + self.assertEqual(b"123", cpt.to_bytes(b"123")) + self.assertEqual(b"", cpt.to_bytes(u"")) + self.assertEqual(b"123", cpt.to_bytes(u"123")) + + # check list types, not inplace + l = [""] + l2 = cpt.to_bytes(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual([b""], l2) + l = ["", "123"] + l2 = cpt.to_bytes(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual([b"", b"123"], l2) + l = ["", b"123", u"321"] + l2 = cpt.to_bytes(l) + self.assertTrue(isinstance(l2, list)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual([b"", b"123", b"321"], l2) + + # check list types, inplace + l = [""] + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b""], l2) + l = ["", b"123"] + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b"", b"123"], l2) + l = ["", b"123", u"321"] + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, list)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual([b"", b"123", b"321"], l2) + for i in l2: + self.assertTrue(isinstance(i, bytes)) + + # check set types, not inplace + l = set([""]) + l2 = cpt.to_bytes(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual(set([b""]), l2) + l = set([u"", u"123"]) + l2 = cpt.to_bytes(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual(set([b"", b"123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_bytes(l, inplace=False) + self.assertTrue(isinstance(l2, set)) + self.assertFalse(l is l2) + self.assertNotEqual(l, l2) + self.assertEqual(set([b"", b"123", b"321"]), l2) + + # check set types, inplace + l = set("") + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set(b""), l2) + l = set([u"", u"123"]) + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([b"", b"123"]), l2) + l = set(["", b"123", u"321"]) + l2 = cpt.to_bytes(l, inplace=True) + self.assertTrue(isinstance(l2, set)) + self.assertTrue(l is l2) + self.assertEqual(l, l2) + self.assertEqual(set([b"", b"123", b"321"]), l2) + for i in l2: + self.assertTrue(isinstance(i, bytes)) + + def test_round(self): + self.assertEqual(3.0, cpt.round(3.4)) + self.assertEqual(4.0, cpt.round(3.5)) + self.assertEqual(0.0, cpt.round(0.1)) + self.assertEqual(0.0, cpt.round(0.0)) + self.assertEqual(-0.0, cpt.round(-0.0)) + self.assertEqual(-0.0, cpt.round(-0.1)) + self.assertEqual(-3.0, cpt.round(-3.4)) + self.assertEqual(-4.0, cpt.round(-3.5)) + 
self.assertEqual(5.0, cpt.round(5)) + self.assertRaises(TypeError, cpt.round, None) + + def test_floor_division(self): + self.assertEqual(0.0, cpt.floor_division(3, 4)) + self.assertEqual(1.0, cpt.floor_division(4, 3)) + self.assertEqual(2.0, cpt.floor_division(6, 3)) + self.assertEqual(-2.0, cpt.floor_division(-4, 3)) + self.assertEqual(-2.0, cpt.floor_division(-6, 3)) + self.assertRaises(ZeroDivisionError, cpt.floor_division, 3, 0) + self.assertRaises(TypeError, cpt.floor_division, None, None) + + def test_get_exception_message(self): + exception_message = "test_message" + self.assertRaises(AssertionError, cpt.get_exception_message, None) + if six.PY2: + self.assertRaises(AttributeError, cpt.get_exception_message, + exception_message) + try: + raise RuntimeError(exception_message) + except Exception as e: + self.assertEqual(exception_message, + cpt.get_exception_message(e)) + self.assertIsNotNone(e) + + try: + raise Exception(exception_message) + except Exception as e: + self.assertEqual(exception_message, + cpt.get_exception_message(e)) + self.assertIsNotNone(e) + + if six.PY3: + try: + raise RuntimeError(exception_message) + except Exception as e: + self.assertEqual(exception_message, + cpt.get_exception_message(e)) + self.assertIsNotNone(e) + + try: + raise Exception(exception_message) + except Exception as e: + self.assertEqual(exception_message, + cpt.get_exception_message(e)) + self.assertIsNotNone(e) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index e9f3c45dc40b3333fe7304f8e4313d156bd5374c..436ab7d49f4cafcd30366ae57c40d49e6f7d614f 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py index 084b8d37386fac0366c190f5f30dd39467072498..5b2b71d050c42b4fea84bab89824d3f5c164b36e 100644 --- a/python/paddle/fluid/tests/unittests/test_conditional_block.py +++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py @@ -12,20 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
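For reference, the behaviour the round/floor_division assertions above expect — Python-2 style round-half-away-from-zero plus ordinary floor division — can be sketched as follows; this is a minimal illustration under those assumptions, not the actual paddle.compat implementation:

import math

def round_half_away_from_zero(x, d=0):
    # Python-2 style rounding: 3.5 -> 4.0 and -3.5 -> -4.0, always as float.
    # (Python 3's builtin round() uses banker's rounding, e.g. round(2.5) == 2.)
    p = 10 ** d
    if x >= 0.0:
        return math.floor(x * p + 0.5) / p
    return math.ceil(x * p - 0.5) / p

def floor_division(x, y):
    # // floors toward negative infinity on both interpreters: -4 // 3 == -2.
    return x // y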
+from __future__ import print_function + import unittest import paddle.fluid.layers as layers import paddle.fluid.core as core from paddle.fluid.framework import default_startup_program, default_main_program from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward +from paddle.fluid.layers.control_flow import ConditionalBlock import numpy -class ConditionalBlock(unittest.TestCase): +class ConditionalBlockTest(unittest.TestCase): def test_forward(self): data = layers.data(name='X', shape=[1], dtype='float32') data.stop_gradient = False - cond = layers.ConditionalBlock(inputs=[data]) + cond = ConditionalBlock(inputs=[data]) out = layers.create_tensor(dtype='float32') with cond.block(): hidden = layers.fc(input=data, size=10) @@ -38,7 +41,7 @@ class ConditionalBlock(unittest.TestCase): x = numpy.random.random(size=(10, 1)).astype('float32') outs = exe.run(feed={'X': x}, fetch_list=[out])[0] - print outs + print(outs) loss = layers.mean(out) append_backward(loss=loss) outs = exe.run( @@ -46,7 +49,7 @@ class ConditionalBlock(unittest.TestCase): fetch_list=[ default_main_program().block(0).var(data.name + "@GRAD") ])[0] - print outs + print(outs) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py index d1075d514e9b2b692f271f10a005815a66b421fb..0b2431d7726e845da33f6bcf9c74058788dd9654 100644 --- a/python/paddle/fluid/tests/unittests/test_const_value.py +++ b/python/paddle/fluid/tests/unittests/test_const_value.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.framework as framework -class ConditionalBlock(unittest.TestCase): +class ConstantTest(unittest.TestCase): def test_const_value(self): self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD") self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@") diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py index db6be21baaa54d33af9f5c44d1815e4b389eb884..1902a9869807ba7ce3f9828c124256cc6752857e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride @@ -20,16 +22,19 @@ from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride class TestMKLDNN(TestConv2dOp): def init_kernel_type(self): self.use_mkldnn = True + self.data_format = "NCHW" class TestMKLDNNWithPad(TestWithPad): def init_kernel_type(self): self.use_mkldnn = True + self.data_format = "NCHW" class TestMKLDNNWithStride(TestWithStride): def init_kernel_type(self): self.use_mkldnn = True + self.data_format = "NCHW" if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index a478649541ba9828e55c4239090d5aee554223ac..6a2732e9399aa5a93f4c47eb73bfd23dba608c3d 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
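The "from __future__ import print_function" lines added throughout this patch, and the "/" to "//" changes in the hunks below, follow from the Python 2/3 semantic differences; a quick illustration:

# Python 2: print is a statement and 7 / 2 == 3 for ints.
# Python 3: print is a function and 7 / 2 == 3.5 (true division).
from __future__ import print_function  # gives Python 2 the Python 3 print()

print(7 / 2)    # 3 under Python 2, 3.5 under Python 3
print(7 // 2)   # 3 under both, which is why the shape arithmetic below switches to //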
+from __future__ import print_function + import unittest import numpy as np @@ -24,12 +26,12 @@ def conv2d_forward_naive(input, filter, group, conv_param): out_c, f_c, f_h, f_w = filter.shape assert f_c * group == in_c assert np.mod(out_c, group) == 0 - sub_out_c = out_c / group + sub_out_c = out_c // group stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[ 'dilation'] - out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) / stride[0] - out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1] + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] out = np.zeros((in_n, out_c, out_h, out_w)) d_bolck_h = (dilation[0] * (f_h - 1) + 1) @@ -66,6 +68,7 @@ class TestConv2dOp(OpTest): self.op_type = "conv2d" self.use_cudnn = False self.use_mkldnn = False + self.data_format = "AnyLayout" self.dtype = np.float32 self.init_kernel_type() self.init_group() @@ -93,7 +96,8 @@ class TestConv2dOp(OpTest): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_mkldnn + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format } self.outputs = {'Output': output} @@ -101,66 +105,42 @@ class TestConv2dOp(OpTest): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - set(['Input', 'Filter']), - 'Output', - max_relative_error=0.02) - else: - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) - else: - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) - else: - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] def init_dilation(self): @@ -179,7 +159,7 @@ class 
TestWithPad(TestConv2dOp): self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] @@ -189,7 +169,7 @@ class TestWithStride(TestConv2dOp): self.stride = [2, 2] self.input_size = [2, 3, 6, 6] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] @@ -204,7 +184,7 @@ class TestWith1x1(TestConv2dOp): self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 1, 1] def init_group(self): @@ -217,7 +197,7 @@ class TestWithDilation(TestConv2dOp): self.stride = [1, 1] self.input_size = [2, 3, 10, 10] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] def init_dilation(self): @@ -233,7 +213,7 @@ class TestWithInput1x1Filter1x1(TestConv2dOp): self.stride = [1, 1] self.input_size = [2, 3, 1, 1] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 1, 1] def init_group(self): @@ -350,7 +330,7 @@ class TestDepthwiseConv(TestConv2dOp): self.input_size = [2, 3, 5, 5] # NCHW self.groups = 3 assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] self.op_type = "depthwise_conv2d" @@ -362,7 +342,7 @@ class TestDepthwiseConv2(TestConv2dOp): self.input_size = [2, 3, 5, 5] # NCHW self.groups = 3 assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] self.op_type = "depthwise_conv2d" diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 07545e7feb46c85a4b80f9b846be27d36cbfb59a..2a320e735bd7db5dc138f8263ba1b5cb115ba197 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
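As a quick sanity check of the output-size formula in conv2d_forward_naive above, now using integer floor division so the result stays an int on Python 3 (values taken from the default TestConv2dOp case: 5x5 input, 3x3 filter, pad 0, stride 1, dilation 1):

in_h, pad, dilation, f_h, stride = 5, 0, 1, 3, 1
out_h = 1 + (in_h + 2 * pad - (dilation * (f_h - 1) + 1)) // stride
print(out_h)  # 3 -- with plain / this would be 3.0 under Python 3 and break np.zeros(shape)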
+from __future__ import print_function + import unittest import numpy as np @@ -25,7 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): groups = attrs['groups'] assert in_c == f_c out_c = f_out_c * groups - sub_in_c = in_c / groups + sub_in_c = in_c // groups stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ 'dilations'] @@ -191,12 +193,16 @@ class TestWithDilation(TestConv2dTransposeOp): # ------------ test_cudnn ------------ +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNN(TestConv2dTransposeOp): def init_op_type(self): self.use_cudnn = True self.op_type = "conv2d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1] @@ -212,6 +218,8 @@ class TestCUDNNWithPad(TestWithPad): self.op_type = "conv2d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1] @@ -227,6 +235,8 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv2d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithGroups(TestWithGroups): def init_test_case(self): self.pad = [1, 1] @@ -250,7 +260,7 @@ class TestDepthwiseConvTranspose(TestConv2dTransposeOp): self.input_size = [2, 8, 16, 16] # NCHW self.groups = 8 assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [self.input_size[1], f_c, 4, 4] self.op_type = "depthwise_conv2d_transpose" diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index dd4ef7cc94ea1e8de5fe4775408389907d47d0d6..ddaf99fe061205f0f2e4c592c9e28e27e657c16a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
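The unittest.skipIf decorators added above keep the cuDNN cases from failing on CPU-only builds; the same guard can be put on any CUDA-only test, for example (hypothetical test class, shown only to illustrate the pattern):

import unittest
import paddle.fluid.core as core

@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestCUDAOnlyCase(unittest.TestCase):
    def test_compiled_with_cuda(self):
        # only reached when the wheel was built with CUDA support
        self.assertTrue(core.is_compiled_with_cuda())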
+from __future__ import print_function + import unittest import numpy as np @@ -24,14 +26,14 @@ def conv3d_forward_naive(input, filter, group, conv_param): out_c, f_c, f_d, f_h, f_w = filter.shape assert f_c * group == in_c assert np.mod(out_c, group) == 0 - sub_out_c = out_c / group + sub_out_c = out_c // group stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[ 'dilations'] - out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) / stride[0] - out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) / stride[1] - out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) / stride[2] + out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) // stride[0] + out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) // stride[1] + out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) // stride[2] out = np.zeros((in_n, out_c, out_d, out_h, out_w)) @@ -166,7 +168,7 @@ class TestConv3dOp(OpTest): self.stride = [1, 1, 1] self.input_size = [2, 3, 4, 4, 4] # NCDHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3, 3] def init_dilation(self): @@ -185,7 +187,7 @@ class TestCase1(TestConv3dOp): self.stride = [1, 1, 1] self.input_size = [2, 3, 4, 4, 4] # NCDHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3, 3] @@ -205,7 +207,7 @@ class TestWith1x1(TestConv3dOp): self.stride = [1, 1, 1] self.input_size = [2, 3, 4, 4, 4] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 1, 1, 1] def init_dilation(self): @@ -221,7 +223,7 @@ class TestWithInput1x1Filter1x1(TestConv3dOp): self.stride = [1, 1, 1] self.input_size = [2, 3, 1, 1, 1] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 1, 1, 1] def init_dilation(self): @@ -237,7 +239,7 @@ class TestWithDilation(TestConv3dOp): self.stride = [1, 1, 1] self.input_size = [2, 3, 6, 6, 6] # NCDHW assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] / self.groups + f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 2, 2, 2] def init_dilation(self): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index c9f26d10df8ff39d6bd77b1597336600f676d362..8d9075961cbec32bc34fcf0c92cfbb7e6c00d886 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np @@ -25,7 +27,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): groups = attrs['groups'] assert in_c == f_c out_c = f_out_c * groups - sub_in_c = in_c / groups + sub_in_c = in_c // groups stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ 'dilations'] @@ -197,12 +199,16 @@ class TestWithDilation(TestConv3dTransposeOp): # ------------ test_cudnn ------------ +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNN(TestConv3dTransposeOp): def init_op_type(self): self.use_cudnn = True self.op_type = "conv3d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1, 1] @@ -218,6 +224,8 @@ class TestCUDNNWithPad(TestWithPad): self.op_type = "conv3d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1, 1] @@ -233,6 +241,8 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv3d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithGroups(TestWithGroups): def init_test_case(self): self.pad = [1, 1, 1] diff --git a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py index 5d4d244f439a671d895f9237b793e6c6bbf2895b..b7364e869e7420e610363eafcc4964b825e57326 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -21,9 +23,9 @@ def conv_shift_forward(x, y): out = np.zeros_like(x) M = x.shape[1] N = y.shape[1] - y_half_width = (N - 1) / 2 - for i in xrange(M): - for j in xrange(N): + y_half_width = (N - 1) // 2 + for i in range(M): + for j in range(N): out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j] return out diff --git a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py index 1b27cd57670e6c9db5eae6b226989a5c772866ce..3c3fd6d4d71503ccc3678ca69d55bcc8536c8c6a 100644 --- a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py +++ b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py index 5e6f9a20a93e467980f5a4f23fbcb6118317fe44..fd34c8fc9390b69afd93229b56aa9189da2a8b28 100644 --- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py +++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import paddle.fluid.layers as layers class TestDocString(unittest.TestCase): def test_layer_doc_string(self): - print layers.dropout.__doc__ + print(layers.dropout.__doc__) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py index 122b076c2d3e3a69f52a2c335e2bc89707b4fa9b..51bd1300e61d58c934a40abf81ab8f137e44910f 100644 --- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py +++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import random import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 4016089c01644f0389855ab114360f90c50a1bbe..d7bcfba8deab1b73e4cbab8a27f9eeef9a37d29b 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index c5b9e92d69133e593a2ce223e83006eda590daa5..f22badbea0c67b210f7ac4e14e5d647f1cffa6cc 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest, randomize_probability @@ -105,5 +107,136 @@ class TestCrossEntropyOp3(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) +class TestCrossEntropyOp4(OpTest): + """Test high rank tensor cross-entropy with discrete one-hot labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + shape = [10, 2, 4] + ins_num = np.prod(np.array(shape)) + class_num = 10 + + X_2d = randomize_probability(ins_num, class_num, dtype='float64') + + label_2d = np.random.randint(0, class_num, (ins_num, 1), dtype="int64") + cross_entropy_2d = np.asmatrix( + [[-np.log(X_2d[i][label_2d[i][0]])] for i in range(X_2d.shape[0])], + dtype="float64") + + X = X_2d.reshape(shape + [class_num]) + label = label_2d.reshape(shape + [1]) + cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1]) + + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": False} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + +class TestCrossEntropyOp5(OpTest): + """Test high rank tensor cross-entropy with vectorized soft labels. 
+ """ + + def setUp(self): + self.op_type = "cross_entropy" + shape = [4, 3] + ins_num = np.prod(np.array(shape)) + class_num = 37 + + X_2d = randomize_probability(ins_num, class_num) + label_2d = np.random.uniform(0.1, 1.0, + [ins_num, class_num]).astype("float32") + label_2d /= label_2d.sum(axis=1, keepdims=True) + cross_entropy_2d = (-label_2d * np.log(X_2d)).sum( + axis=1, keepdims=True).astype("float32") + + X = X_2d.reshape(shape + [class_num]) + label = label_2d.reshape(shape + [class_num]) + cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1]) + + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) + + +class TestCrossEntropyOp6(OpTest): + """Test high rank tensor cross-entropy with vectorized one-hot representation of labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + shape = [4, 3, 2] + ins_num = np.prod(np.array(shape)) + class_num = 17 + + X_2d = randomize_probability(ins_num, class_num) + label_index_2d = np.random.randint( + 0, class_num, (ins_num), dtype="int32") + label_2d = np.zeros(X_2d.shape) + label_2d[np.arange(ins_num), label_index_2d] = 1 + + cross_entropy_2d = np.asmatrix( + [[-np.log(X_2d[i][label_index_2d[i]])] + for i in range(X_2d.shape[0])], + dtype="float32") + + X = X_2d.reshape(shape + [class_num]) + label = label_2d.reshape(shape + [class_num]) + cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1]) + + self.inputs = {"X": X, "Label": label.astype(np.float32)} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) + + +class TestCrossEntropyOp7(OpTest): + """Test cross-entropy with ignore index. + """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 30 + class_num = 10 + ignore_index = 3 + + X = randomize_probability(batch_size, class_num, dtype='float64') + + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") + cross_entropy = np.asmatrix( + [[-np.log(X[i][label[i][0]])] + if label[i][0] != ignore_index else [0] + for i in range(X.shape[0])], + dtype="float64") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": False, "ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py index 131b4076f45ae25b45bb3f64da07a5c3aacc43d5..5f17d2d407cca9a4c95d919d05a3a03b784d1942 100644 --- a/python/paddle/fluid/tests/unittests/test_ctc_align.py +++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import sys import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 04e7f0b94510987a1872c2d625ac4d29a3c6feba..13a4eacece8a211513d6537db0d09b80c238178e 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index 6d810920d55ccf069ff408c553069e8f5e590271..4bd24510bc8ac7f0fbaad3fd1919ab589cd21c4b 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid -import paddle.v2 as paddle +import paddle import numpy as np class TestDataBalance(unittest.TestCase): def prepare_data(self): def fake_data_generator(): - for n in xrange(self.total_ins_num): + for n in range(self.total_ins_num): yield np.ones((3, 4)) * n, n # Prepare data @@ -41,7 +43,7 @@ class TestDataBalance(unittest.TestCase): def prepare_lod_data(self): def fake_data_generator(): - for n in xrange(1, self.total_ins_num + 1): + for n in range(1, self.total_ins_num + 1): d1 = (np.ones((n, 3)) * n).astype('float32') d2 = (np.array(n).reshape((1, 1))).astype('int32') yield d1, d2 @@ -58,9 +60,9 @@ class TestDataBalance(unittest.TestCase): (0, 1)) ] lod = [0] - for _ in xrange(self.batch_size): + for _ in range(self.batch_size): try: - ins = generator.next() + ins = next(generator) except StopIteration: eof = True break @@ -82,7 +84,7 @@ class TestDataBalance(unittest.TestCase): self.data_file_name = './data_balance_test.recordio' self.lod_data_file_name = './data_balance_with_lod_test.recordio' self.total_ins_num = 50 - self.batch_size = 10 + self.batch_size = 12 self.prepare_data() self.prepare_lod_data() @@ -142,8 +144,7 @@ class TestDataBalance(unittest.TestCase): filenames=[self.lod_data_file_name], shapes=[[-1, 3], [-1, 1]], lod_levels=[1, 0], - dtypes=['float32', 'int32'], - thread_num=1) + dtypes=['float32', 'int32']) ins, label = fluid.layers.read_file(data_reader) place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() @@ -156,7 +157,7 @@ class TestDataBalance(unittest.TestCase): main_program=main_prog, build_strategy=build_strategy) - if (parallel_exe.device_count > self.batch_size): + if parallel_exe.device_count > self.batch_size: print("WARNING: Unittest TestDataBalance skipped. 
\ For the result is not correct when device count \ is larger than batch size.") @@ -190,3 +191,7 @@ class TestDataBalance(unittest.TestCase): def test_all(self): self.main() self.main_lod() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py index 870952f2f916dcdec5991ac5c10d2da3a7ab18a8..f4c9466d63a201ba9a5e77515ae64a33bedc5b23 100644 --- a/python/paddle/fluid/tests/unittests/test_debugger.py +++ b/python/paddle/fluid/tests/unittests/test_debugger.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py index 84c44d4817366518a2cbc3f0a777ab32b67f3d11..a664a1529f4de1f372241319b57fad6b0ba8b8a2 100644 --- a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py +++ b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py index a3bf7b544b91c70ffe3894219c118ec9887aba81..01a7b6824885b32e922a8eb34f5d8117ee3e584f 100644 --- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py +++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from paddle.fluid.default_scope_funcs import * import unittest @@ -39,7 +41,7 @@ class TestDefaultScopeFuncs(unittest.TestCase): self.assertTrue(i.is_int()) self.assertEqual(10, i.get_int()) - for _ in xrange(10): + for _ in range(10): scoped_function(__new_scope__) diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py new file mode 100644 index 0000000000000000000000000000000000000000..08579c7dd62ea6aea87b053345211914a6be6237 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -0,0 +1,198 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
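The data-balance changes above follow the same migration pattern seen elsewhere in this patch: xrange is gone in Python 3 (range is already lazy there) and generators no longer expose .next(), so the built-in next() is used instead:

def fake_data_generator(total):
    for n in range(total):   # range, not xrange, works on both 2 and 3
        yield n

gen = fake_data_generator(3)
print(next(gen))   # 0 -- next(gen) replaces gen.next(), which Python 3 removed
print(next(gen))   # 1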
+ +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import six +import collections + +SEED = 1 +DTYPE = "float32" +paddle.dataset.mnist.fetch() + + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1) + ] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(batch_size): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def operator_equal(a, b): + for k, v in six.iteritems(a.__dict__): + if isinstance(v, fluid.framework.Program) or \ + isinstance(v, fluid.framework.Block): + continue + + elif isinstance(v, core.OpDesc): + if v.serialize_to_string() != b.__dict__[k].serialize_to_string(): + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + elif isinstance(v, collections.OrderedDict): + v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0]) + v1 = sorted(list(six.iteritems(b.__dict__[k])), key=lambda x: x[0]) + + if v0 != v1: + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + elif (v != b.__dict__[k]): + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + return True + + +def block_equal(a, b): + for k, v in six.iteritems(a.__dict__): + if isinstance(v, core.ProgramDesc) or isinstance( + v, fluid.framework.Program) or isinstance(v, core.BlockDesc): + continue + + elif k == "ops": + assert (len(a.ops) == len(b.ops)) + for i in range(0, len(a.ops)): + if not operator_equal(a.ops[i], 
b.ops[i]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + elif isinstance(v, collections.OrderedDict): + for key, value in six.iteritems(v): + if str(value) != str(b.__dict__[k][key]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + elif (v != b.__dict__[k]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + return True + + +def program_equal(a, b): + for k, v in six.iteritems(a.__dict__): + if isinstance(v, core.ProgramDesc): + continue + + elif k == 'blocks': + for i in range(0, len(a.blocks)): + if not block_equal(a.blocks[i], b.blocks[i]): + raise ValueError("In operator_equal not equal:{0}\n".format( + k)) + return False + assert (len(a.blocks) == len(b.blocks)) + + elif (v != b.__dict__[k]): + raise ValueError("In program_equal not equal:{0}\n".format(k)) + + return True + + +class TestDistMnist(unittest.TestCase): + def test_desc_clone(self): + get_model(batch_size=20) + + pserver_endpoints = "127.0.0.1:9123" + trainers = 1 + current_endpoint = "127.0.0.1:9123" + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + main = pserver_prog.clone() + startup = startup_prog.clone() + + self.assertTrue(program_equal(main, pserver_prog)) + self.assertTrue(program_equal(startup, startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index 05d3367ad8ec2bc3df794015a7c25e943a26c68c..f6eb8f2c6d8b94f92e24ff789c91efb53a645a46 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np +import six import sys import collections import math @@ -176,7 +179,7 @@ class TestDetectionMAPOp(OpTest): true_pos[label].append([score, tp]) false_pos[label].append([score, fp]) - for (label, label_pos_num) in label_count.items(): + for (label, label_pos_num) in six.iteritems(label_count): if label_pos_num == 0 or label not in true_pos: continue label_true_pos = true_pos[label] label_false_pos = false_pos[label] diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -0,0 +1,344 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
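cnn_model in test_desc_clone.py above computes the fc parameter shape with six.moves.reduce because the reduce builtin (used by the old test_dist_mnist.py deleted further down) was moved to functools in Python 3; the same product can be written either way:

import six
import functools

input_shape = (64, 50, 4, 4)   # e.g. (batch, channels, h, w) after the second pool, for illustration
flat = six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1)
print(flat)                                                        # 800
print(functools.reduce(lambda a, b: a * b, input_shape[1:], 1))    # same result on 2 and 3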
+ +from __future__ import print_function +import time + +import unittest +import os +import sys +import six +import signal +import subprocess +import argparse + + +class TestDistRunnerBase(object): + def get_model(self, batch_size=2): + raise NotImplementedError( + "get_model should be implemented by child classes.") + + def get_transpiler(self, trainer_id, main_program, pserver_endpoints, + trainers, sync_mode): + # NOTE: import fluid until runtime, or else forking processes will cause error. + import paddle + import paddle.fluid as fluid + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=sync_mode) + return t + + def run_pserver(self, args): + import paddle + import paddle.fluid as fluid + self.get_model(batch_size=2) + if args.mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), args.endpoints, + args.trainers, args.sync_mode) + pserver_prog = t.get_pserver_program(args.current_endpoint) + startup_prog = t.get_startup_program(args.current_endpoint, + pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def run_trainer(self, place, args): + import paddle + import paddle.fluid as fluid + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=2) + if args.mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + if args.is_dist: + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), + args.endpoints, args.trainers, + args.sync_mode) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + build_stra = fluid.BuildStrategy() + + if args.use_reduce: + build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + exe = fluid.ParallelExecutor( + True, + loss_name=avg_cost.name, + exec_strategy=strategy, + build_strategy=build_stra) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.values() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = test_reader() + + data = next(reader_generator) + first_loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + print(first_loss) + + for i in six.moves.xrange(5): + data = next(reader_generator) + loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + + data = next(reader_generator) + last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + print(last_loss) + + +def runtime_main(test_class): + import paddle + import paddle.fluid as fluid + import paddle.fluid.core as core + + parser = argparse.ArgumentParser(description='Run dist test.') + parser.add_argument( + '--role', type=str, required=True, choices=['pserver', 'trainer']) + parser.add_argument('--endpoints', type=str, required=False, default="") + parser.add_argument('--is_dist', action='store_true') + parser.add_argument('--trainer_id', type=int, required=False, default=0) + parser.add_argument('--trainers', type=int, required=False, default=1) + parser.add_argument( + '--current_endpoint', type=str, required=False, default="") + 
parser.add_argument('--sync_mode', action='store_true') + parser.add_argument('--mem_opt', action='store_true') + parser.add_argument('--use_reduce', action='store_true') + + args = parser.parse_args() + + model = test_class() + if args.role == "pserver" and args.is_dist: + model.run_pserver(args) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, args) + + +import paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + def _setup_config(self): + raise NotImplementedError("tests should have _setup_config implemented") + + def setUp(self): + self._trainers = 2 + self._pservers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = "python" + self._sync_mode = True + self._mem_opt = False + self._use_reduce = False + self._setup_config() + + def _find_free_port(self): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + def start_pserver(self, model_file, check_error_log): + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" + ps0_cmd = ps_cmd % \ + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) + ps1_cmd = ps_cmd % \ + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) + + if self._sync_mode: + ps0_cmd += " --sync_mode" + ps1_cmd += " --sync_mode" + if self._mem_opt: + ps0_cmd += " --mem_opt" + ps1_cmd += " --mem_opt" + + ps0_pipe = subprocess.PIPE + ps1_pipe = subprocess.PIPE + if check_error_log: + print(ps0_cmd) + print(ps1_cmd) + ps0_pipe = open("/tmp/ps0_err.log", "wb") + ps1_pipe = open("/tmp/ps1_err.log", "wb") + + ps0_proc = subprocess.Popen( + ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) + ps1_proc = subprocess.Popen( + ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + + if not check_error_log: + return ps0_proc, ps1_proc, None, None + else: + return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe + + def _wait_ps_ready(self, pid): + retry_times = 50 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error as e: + sys.stderr.write('waiting for pserver: %s, left retry %d\n' % + (e, retry_times)) + retry_times -= 1 + + def check_with_place(self, model_file, delta=1e-3, check_error_log=False): + # TODO(typhoonzero): should auto adapt GPU count on the machine. 
+ required_envs = { + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH"), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_cudnn_deterministic": "1" + } + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + + # Run local to get a base line + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local.update(required_envs) + local_cmd = "%s %s --role trainer" % (self._python_interp, model_file) + if not check_error_log: + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env_local) + else: + err_log = open("/tmp/trainer.err.log", "wb") + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=err_log, + env=env_local) + + local_proc.wait() + out, err = local_proc.communicate() + local_ret = cpt.to_text(out) + sys.stderr.write('local_loss: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % err) + + # Run dist train to compare with local results + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, + check_error_log) + self._wait_ps_ready(ps0.pid) + self._wait_ps_ready(ps1.pid) + + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" + tr0_cmd = tr_cmd % \ + (self._python_interp, model_file, self._ps_endpoints, + 0, ps0_ep, self._trainers) + tr1_cmd = tr_cmd % \ + (self._python_interp, model_file, self._ps_endpoints, + 1, ps1_ep, self._trainers) + + if self._sync_mode: + tr0_cmd += " --sync_mode" + tr1_cmd += " --sync_mode" + if self._mem_opt: + tr0_cmd += " --mem_opt" + tr1_cmd += " --mem_opt" + if self._use_reduce: + tr0_cmd += " --use_reduce" + tr1_cmd += " --use_reduce" + + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0.update(required_envs) + env1.update(required_envs) + FNULL = open(os.devnull, 'w') + + tr0_pipe = subprocess.PIPE + tr1_pipe = subprocess.PIPE + if check_error_log: + print("tr0_cmd:", tr0_cmd) + print("tr1_cmd:", tr1_cmd) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") + + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_proc.wait() + tr1_proc.wait() + out, err = tr0_proc.communicate() + sys.stderr.write('dist_stderr: %s\n' % err) + loss_data0 = cpt.to_text(out) + sys.stderr.write('dist_loss: %s\n' % loss_data0) + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + # close trainer file + if check_error_log: + tr0_pipe.close() + tr1_pipe.close() + + ps0_pipe.close() + ps1_pipe.close() + # FIXME: use terminate() instead of sigkill. 
+ os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + ps0.terminate() + ps1.terminate() + ps0.wait() + ps1.wait() + FNULL.close() + + self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) + self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index ad2d57f7c5f127be87e963508e1dd150fdd30225..59a137c18c9435ef5c5772d0cc08f197c1d86603 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -12,199 +12,56 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import argparse -import time -import math - -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -from paddle.fluid import core +from __future__ import print_function import unittest -from multiprocessing import Process -import os -import signal - -SEED = 1 -DTYPE = "float32" -paddle.dataset.mnist.fetch() - - -# random seed must set before configuring the network. -# fluid.default_startup_program().random_seed = SEED -def cnn_model(data): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=data, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - - # TODO(dzhwinter) : refine the initializer and random seed settting - SIZE = 10 - input_shape = conv_pool_2.shape - param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] - scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) - return predict - - -def get_model(batch_size): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - inference_program = fluid.default_main_program().clone() - # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) - - # Reader - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=batch_size) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=batch_size) - opt.minimize(avg_cost) - return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict - - -def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=trainer_id, - program=main_program, - pservers=pserver_endpoints, - trainers=trainers) - return t - - -def run_pserver(pserver_endpoints, trainers, current_endpoint): - get_model(batch_size=20) - t = get_transpiler(0, - fluid.default_main_program(), pserver_endpoints, - trainers) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - - place = 
fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - exe.run(pserver_prog) - - -class TestDistMnist(unittest.TestCase): - def setUp(self): - self._trainers = 1 - self._pservers = 1 - self._ps_endpoints = "127.0.0.1:9123" - - def start_pserver(self, endpoint): - p = Process( - target=run_pserver, - args=(self._ps_endpoints, self._trainers, endpoint)) - p.start() - return p.pid - - def _wait_ps_ready(self, pid): - retry_times = 5 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(1) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def stop_pserver(self, pid): - os.kill(pid, signal.SIGTERM) - - def test_with_place(self): - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - pserver_pid = self.start_pserver(self._ps_endpoints) - self._wait_ps_ready(pserver_pid) - - self.run_trainer(p, 0) - - self.stop_pserver(pserver_pid) - - def run_trainer(self, place, trainer_id): - test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=20) - t = get_transpiler(trainer_id, - fluid.default_main_program(), self._ps_endpoints, - self._trainers) - - trainer_prog = t.get_trainer_program() - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - feed_var_list = [ - var for var in trainer_prog.global_block().vars.itervalues() - if var.is_data - ] - - feeder = fluid.DataFeeder(feed_var_list, place) - for pass_id in xrange(10): - for batch_id, data in enumerate(train_reader()): - exe.run(trainer_prog, feed=feeder.feed(data)) - - if (batch_id + 1) % 10 == 0: - acc_set = [] - avg_loss_set = [] - for test_data in test_reader(): - acc_np, avg_loss_np = exe.run( - program=test_program, - feed=feeder.feed(test_data), - fetch_list=[batch_acc, avg_cost]) - acc_set.append(float(acc_np)) - avg_loss_set.append(float(avg_loss_np)) - # get test acc and loss - acc_val = np.array(acc_set).mean() - avg_loss_val = np.array(avg_loss_set).mean() - if float(acc_val - ) > 0.8: # Smaller value to increase CI speed - return - else: - print( - 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. - format(pass_id, batch_id + 1, - float(avg_loss_val), float(acc_val))) - if math.isnan(float(avg_loss_val)): - assert ("got Nan loss, training failed.") +from test_dist_base import TestDistBase +class TestDistMnist2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=1e-7) + + +class TestDistMnist2x2WithMemopt(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._mem_opt = True + + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=1e-7) + + +class TestDistMnistAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_reduce = False + + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=200) + + +# FIXME(typhoonzero): enable these tests once we have 4 +# 4 GPUs on CI machine, and the base class should be updated. 
+# +# class TestDistMnist2x2ReduceMode(TestDistBase): +# def _setup_config(self): +# self._sync_mode = True +# self._use_reduce = True + +# def test_se_resnext(self): +# self.check_with_place("dist_mnist.py", delta=1e-7) + +# class TestDistMnistAsyncReduceMode(TestDistBase): +# def _setup_config(self): +# self._sync_mode = False +# self._use_reduce = True + +# def test_se_resnext(self): +# self.check_with_place("dist_mnist.py", delta=200) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e9fa38e7d1eadd89eff9a8ba4442f888b8120e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistSeResneXt2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + + def test_se_resnext(self): + self.check_with_place("dist_se_resnext.py", delta=1e-7) + + +class TestDistSeResneXt2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_se_resnext.py", delta=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 562e66b0625083fe840d64967249f0215cfda1f9..083525ccf54d389b60c4aaa9f8c6223f07c773cd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
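With TestDistBase in place, wiring up a further distributed test is a matter of overriding _setup_config and pointing check_with_place at the trainer script, as the MNIST and SE-ResNeXt cases above do; for instance (hypothetical script and class names, shown only to illustrate the pattern):

from test_dist_base import TestDistBase

class TestDistWord2vec2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True

    def test_dist_train(self):
        # delta is the tolerated gap between the local and the distributed loss
        self.check_with_place("dist_word2vec.py", delta=1e-4)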
+from __future__ import print_function + import os import time import unittest @@ -22,6 +24,15 @@ import numpy import paddle.fluid as fluid import paddle.fluid.layers as layers +from paddle.fluid.layers.io import ListenAndServ +from paddle.fluid.layers.io import Recv +from paddle.fluid.layers.io import Send + +from paddle.fluid import core + +RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName( +) +RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC class TestSendOp(unittest.TestCase): @@ -65,8 +76,7 @@ class TestSendOp(unittest.TestCase): main = fluid.Program() with fluid.program_guard(main): - serv = layers.ListenAndServ( - "127.0.0.1:0", ["X"], optimizer_mode=False) + serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False) with serv.do(): out_var = main.global_block().create_var( name="scale_0.tmp_0", @@ -87,20 +97,31 @@ class TestSendOp(unittest.TestCase): def init_client(self, place, port): main = fluid.Program() with fluid.program_guard(main): + main.global_block().append_op( + type="fetch_barrier", + inputs={}, + outputs={"Out": []}, + attrs={ + "endpoints": ["127.0.0.1:{0}".format(port)], + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) + x = layers.data( shape=[32, 32], dtype='float32', name='X', append_batch_size=False) fluid.initializer.Constant(value=2.3)(x, main.global_block()) + get_var = main.global_block().create_var( name="scale_0.tmp_0", # server side var dtype="float32", persistable=False, shape=[32, 32]) fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) - layers.Send("127.0.0.1:%d" % port, [x]) - o = layers.Recv("127.0.0.1:%d" % port, [get_var]) + + Send("127.0.0.1:%d" % port, [x]) + o = Recv("127.0.0.1:%d" % port, [get_var]) exe = fluid.Executor(place) self.dist_out = exe.run(main, fetch_list=o) # o is a list diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a8e6ce4cfe18384e405f1602429628914d2c2e00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
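# --- Editorial sketch (illustration only, not part of the patch): how the client in
# TestSendOp above tags a hand-appended RPC op. kOpRoleAttrName() and OpRole.RPC are
# exposed from the C++ op maker via paddle.fluid.core; attaching them to the
# fetch_barrier op marks it as an RPC op. The endpoint is a placeholder.
import paddle.fluid as fluid
from paddle.fluid import core

prog = fluid.Program()
prog.global_block().append_op(
    type="fetch_barrier",
    inputs={},
    outputs={"Out": []},
    attrs={
        "endpoints": ["127.0.0.1:6174"],  # pserver endpoints to synchronize with
        core.op_proto_and_checker_maker.kOpRoleAttrName():
        core.op_proto_and_checker_maker.OpRole.RPC
    })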
+ +from __future__ import print_function + +import unittest +import paddle +from test_dist_base import TestDistBase + + +def download_files(): + url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/' + vocab_url = url_prefix + 'vocab.bpe.32000' + vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853' + paddle.dataset.common.download(vocab_url, 'test_dist_transformer', + vocab_md5) + + local_train_url = url_prefix + 'train.tok.clean.bpe.32000.en-de' + local_train_md5 = '033eb02b9449e6dd823f050782ac8914' + paddle.dataset.common.download(local_train_url, 'test_dist_transformer', + local_train_md5) + + train0_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_0' + train0_md5 = 'ddce7f602f352a0405267285379a38b1' + paddle.dataset.common.download(train0_url, 'test_dist_transformer', + train0_md5) + + train1_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_1' + train1_md5 = '8757798200180285b1a619cd7f408747' + paddle.dataset.common.download(train1_url, 'test_dist_transformer', + train1_md5) + + test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de' + test_md5 = '9dd74a266dbdb25314183899f269b4a2' + paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5) + + +class TestDistTransformer2x2Sync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + + def test_transformer(self): + download_files() + #Note: loss on test dataset of the first 5 batch are: + # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855 + self.check_with_place("dist_transformer.py", delta=1e-7) + + +class TestDistTransformer2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_transformer(self): + download_files() + self.check_with_place("dist_transformer.py", delta=1.0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 75b4b4e50da04521021dcb1e97cfe495f2619433..a198b25520f97ce23b9c1ebb9cd82fc458222d73 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,10 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
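# --- Editorial sketch (illustration only, not part of the patch): the download helper
# used by download_files() above. paddle.dataset.common.download(url, module, md5)
# appears to fetch the file into the local dataset cache (typically
# ~/.cache/paddle/dataset/<module>/), verify the MD5, and return the local path.
import paddle

vocab_path = paddle.dataset.common.download(
    'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/vocab.bpe.32000',
    'test_dist_transformer',
    'a86d345ca6e27f6591d0dccb1b9be853')
print(vocab_path)  # cached local file, reused on later runs once the checksum matches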
+from __future__ import print_function + +import math + import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops import traceback +import collections +import six class TranspilerTest(unittest.TestCase): @@ -27,7 +33,6 @@ class TranspilerTest(unittest.TestCase): self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" self.pserver1_ep = "127.0.0.1:6174" self.pserver2_ep = "127.0.0.1:6175" - self.slice_var_up = True self.sync_mode = True self.transpiler = None @@ -43,45 +48,78 @@ class TranspilerTest(unittest.TestCase): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) - return def get_main_program(self): main = fluid.Program() + main.random_seed = 1 with fluid.program_guard(main): self.net_conf() self.origin_prog = main.clone() return main - def get_trainer(self): - t = self._transpiler_instance() - return t.get_trainer_program() + def get_trainer(self, config=None): + src = fluid.default_startup_program().clone() + + t = self._transpiler_instance(config) + + trainer_main = t.get_trainer_program(wait_port=False) + trainer_startup = fluid.default_startup_program() + + assert (src.num_blocks == 1) + assert (trainer_startup.num_blocks == src.num_blocks) - def get_pserver(self, ep): - t = self._transpiler_instance() + return trainer_main, trainer_startup + + def get_pserver(self, ep, config=None, sync_mode=True): + t = self._transpiler_instance(config, sync_mode) pserver = t.get_pserver_program(ep) startup = t.get_startup_program(ep, pserver) return pserver, startup - def _transpiler_instance(self): + def _transpiler_instance(self, config=None, sync_mode=True): if not self.transpiler: main = self.get_main_program() - self.transpiler = fluid.DistributeTranspiler() + self.transpiler = fluid.DistributeTranspiler(config=config) self.transpiler.transpile( self.trainer_id, program=main, pservers=self.pserver_eps, trainers=self.trainers, - slice_var_up=self.slice_var_up, - sync_mode=self.sync_mode) + sync_mode=sync_mode) + return self.transpiler + def transpiler_test_impl(self): + pass -class TestBasicModel(TranspilerTest): def test_transpiler(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.transpiler_test_impl() + + +class TestBasicModel(TranspilerTest): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) pserver2, startup2 = self.get_pserver(self.pserver2_ep) - trainer = self.get_trainer() + trainer, trainer_startup = self.get_trainer() + + # splited var blocks should be in startup program + self.assertTrue("fc_w.block0" in trainer_startup.global_block().vars) + self.assertTrue("fc_w.block1" in trainer_startup.global_block().vars) + self.assertTrue("fc_w" in trainer_startup.global_block().vars) + self.assertTrue("fc_b" in trainer_startup.global_block().vars) + self.assertTrue("fc_w@GRAD" not in trainer_startup.global_block().vars) + self.assertTrue("fc_b@GRAD" not in trainer_startup.global_block().vars) + + src = [op.type for op in trainer_startup.global_block().ops] + dst = ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', \ + 'fetch_barrier', 'concat'] + + self.assertEqual(src, dst) self.assertEqual([op.type for op in trainer.global_block().ops], [ 'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean', @@ -124,18 +162,71 @@ class TestBasicModel(TranspilerTest): self.assertEqual(set(pserver_params), set(trainer_params)) 
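# --- Editorial sketch (illustration only, not part of the patch): the end-to-end
# DistributeTranspiler flow that the transpiler tests above and below exercise. The
# network mirrors net_conf() in TranspilerTest; endpoints and trainer counts are
# placeholders.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1000, act=None,
                            param_attr=fluid.ParamAttr(name='fc_w'),
                            bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
avg_cost = fluid.layers.mean(fluid.layers.square_error_cost(input=y_predict, label=y))
fluid.optimizer.SGD(learning_rate=0.1).minimize(avg_cost)

config = fluid.DistributeTranspilerConfig()
t = fluid.DistributeTranspiler(config=config)
t.transpile(trainer_id=0,
            program=fluid.default_main_program(),
            pservers="127.0.0.1:6174,127.0.0.1:6175",
            trainers=2,
            sync_mode=True)

trainer_prog = t.get_trainer_program()                  # run on every trainer
pserver_prog = t.get_pserver_program("127.0.0.1:6174")  # run on this pserver
startup_prog = t.get_startup_program("127.0.0.1:6174", pserver_prog)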
+class TestBasicModelWithLargeBlockSize(TranspilerTest): + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + config.min_block_size = 1048576 + + pserver, startup = self.get_pserver(self.pserver1_ep, config) + pserver2, startup2 = self.get_pserver(self.pserver2_ep, config) + + trainer, _ = self.get_trainer(config) + + self.assertEqual([op.type for op in trainer.global_block().ops], [ + 'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean', + 'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'send_barrier', + 'recv', 'recv', 'fetch_barrier' + ]) + + self.assertEqual(len(pserver.blocks), 2) + # block0: listen_and_serv + self.assertEqual([op.type for op in pserver.blocks[0].ops], + ["listen_and_serv"]) + # block1~2: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "sgd"]) + # confirm startup program + self.assertEqual([op.type for op in startup.global_block().ops], + ["fill_constant", "fill_constant"]) + # the variable #fc_w will be split into two blocks + fc_w_var = startup2.global_block().var("fc_w") + self.assertEqual(fc_w_var.shape, (1000, 1000)) + # all parameters should be optimized on pserver + + pserver_params = [] + for prog in [pserver, pserver2]: + for blk in prog.blocks: + for op in blk.ops: + if "Param" in op.input_names: + param_name = op.input("Param")[0] + is_block_idx = param_name.find(".block") + if is_block_idx != -1: + origin_param_name = param_name[:is_block_idx] + else: + origin_param_name = param_name + pserver_params.append(origin_param_name) + trainer_params = [] + for op in self.origin_prog.global_block().ops: + if "Param" in op.input_names: + trainer_params.append(op.input("Param")[0]) + self.assertEqual(set(pserver_params), set(trainer_params)) + + class TestNoSliceVar(TranspilerTest): def setUp(self): super(TestNoSliceVar, self).setUp() - self.slice_var_up = False - def test_transpiler(self): - _, startup = self.get_pserver(self.pserver1_ep) - _, startup2 = self.get_pserver(self.pserver2_ep) + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + config.slice_var_up = False - if startup.global_block().vars.has_key("fc_w"): + _, startup = self.get_pserver(self.pserver1_ep, config) + _, startup2 = self.get_pserver(self.pserver2_ep, config) + + if "fc_w" in startup.global_block().vars: fc_w_var = startup.global_block().vars["fc_w"] - elif startup2.global_block().vars.has_key("fc_w"): + elif "fc_w" in startup2.global_block().vars: fc_w_var = startup2.global_block().vars["fc_w"] self.assertEqual(fc_w_var.shape, (1000, 1000)) @@ -159,11 +250,10 @@ class TestLRDecay(TranspilerTest): decay_rate=0.1, staircase=True)) sgd_optimizer.minimize(avg_cost) - return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) - trainer = self.get_trainer() + trainer, _ = self.get_trainer() self.assertEqual(len(pserver.blocks), 4) lr_decay_ops = [op.type for op in pserver.blocks[1].ops] @@ -189,16 +279,15 @@ class TestLRDecayConditional(TranspilerTest): learning_rate=fluid.layers.piecewise_decay([10000, 20000], [1.0, 0.5, 1.0])) sgd_optimizer.minimize(avg_cost) - return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) - trainer = self.get_trainer() + trainer, _ = self.get_trainer() serv_op = pserver.blocks[0].ops[0] sub_blocks = [] optimize_blocks = [] - for b in 
serv_op.attrs["optimize_blocks"]: + for b in serv_op.all_attrs()["optimize_blocks"]: optimize_blocks.append(b.idx) for b in pserver.blocks: if b.idx not in optimize_blocks: @@ -238,11 +327,10 @@ class TestL2Decay(TranspilerTest): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) - return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) - trainer = self.get_trainer() + trainer, _ = self.get_trainer() self.assertEqual(len(pserver.blocks), 3) self.assertEqual([op.type for op in pserver.blocks[1].ops], @@ -253,10 +341,322 @@ class TestL2Decay(TranspilerTest): # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer - # FIXME(typhoonzero): need to add test for async case: - # see https://github.com/PaddlePaddle/Paddle/issues/11691 -class TestAsyncSGD(TranspilerTest): - pass +class TestL2DecayWithPiecewise(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + base_lr = 1.0 + bd = [1, 10, 20, 30] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + sgd_optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + sgd_optimizer.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer, _ = self.get_trainer() + + self.assertEqual(len(pserver.blocks), 9) + self.assertEqual([op.type for op in pserver.blocks[1].ops], [ + "increment", "cast", "fill_constant", "fill_constant", "less_than", + "logical_not", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "conditional_block" + ]) + self.assertEqual( + [op.type for op in pserver.blocks[7].ops], + ["sum", "scale", "scale", "elementwise_add", "momentum"]) + self.assertEqual( + [op.type for op in pserver.blocks[8].ops], + ["sum", "scale", "scale", "elementwise_add", "momentum"]) + + +class TestDistLookupTableBase(TranspilerTest): + def network_with_table(self, is_sparse, is_distributed): + self.table_size = 1000 + self.emb_size = 64 + self.lookup_table_name = 'shared_w' + + def emb_pool(ids): + emb = fluid.layers.embedding( + input=ids, + size=[self.table_size, self.emb_size], + dtype='float32', + param_attr=self.lookup_table_name, # share parameter + is_sparse=is_sparse, + is_distributed=is_distributed) + pool = fluid.layers.sequence_pool(input=emb, pool_type='average') + return pool + + title_ids = fluid.layers.data( + name='title_ids', shape=[1], dtype='int64', lod_level=1) + brand_ids = fluid.layers.data( + name='brand_ids', shape=[1], dtype='int64', lod_level=1) + title_emb = emb_pool(title_ids) + brand_emb = emb_pool(brand_ids) + fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1) + predict = 
fluid.layers.fc(input=fc0, + size=2, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + +class TestLocalLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 3) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "scale", "adam", "scale", "scale"]) + + trainer, _ = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', + 'fill_constant', 'mean_grad', 'cross_entropy_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', + 'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestDistLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 6) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table sgd + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "sgd"]) + # 3 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["lookup_sparse_table"]) + # 4 prefetch -> lookup_sparse_table for data1 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], + ["lookup_sparse_table"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) + + trainer, _ = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', + 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv', + 'fetch_barrier' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestAsyncLocalLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) + + 
self.assertEqual(len(pserver1.blocks), 3) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["adam", "scale", "scale"]) + + trainer, _ = self.get_trainer(config) + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', + 'fill_constant', 'mean_grad', 'cross_entropy_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv', + 'recv', 'recv', 'concat' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestAsyncDistLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) + + self.assertEqual(len(pserver1.blocks), 6) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["adam", "scale", "scale"]) + # 2 optimize for table sgd + self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"]) + # 3 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["lookup_sparse_table"]) + # 4 prefetch -> lookup_sparse_table for data1 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], + ["lookup_sparse_table"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) + + trainer, _ = self.get_trainer(config) + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', + 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_ids', 'send', 'recv', 'recv' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestDistLookupTableSliceSize(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + pserver1, _ = self.get_pserver(self.pserver1_ep, config) + + self.assertTrue(self.transpiler.has_distributed_lookup_table) + lookup_table_var = pserver1.global_block().vars[ + self.transpiler.table_name] + row_size = lookup_table_var.shape[0] + calc_row_size = int(math.ceil(self.table_size / self.pservers)) + self.assertEqual(row_size, calc_row_size) + + +class TestDistArgsInProgram(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + + self.assertTrue(trainer._is_distributed) + self.assertTrue(trainer._is_chief) + self.assertEqual(trainer._distributed_lookup_table, + self.lookup_table_name) + self.assertEqual(trainer._endpoints, 
+ [self.pserver1_ep, self.pserver2_ep]) + + +class TestRMSPropOptimizer(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + pserver2, startup2 = self.get_pserver(self.pserver2_ep) + + self.assertEqual(len(pserver.blocks), 3) + # block1~2: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "rmsprop"]) + # the variable #fc_w will be split into two blocks + fc_w_var = startup.global_block().var("fc_w.block1") + self.assertEqual(fc_w_var.shape, (500, 1000)) + moment_var = startup.global_block().var("momentum_1") + self.assertEqual(moment_var.shape, (500, 1000)) + + +class TestLoadSliceVar(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, _ = self.get_pserver(self.pserver1_ep) + pserver2, _ = self.get_pserver(self.pserver2_ep) + + self.assertTrue(pserver._slice_vars_and_attrs) + self.assertTrue(pserver2._slice_vars_and_attrs) + + for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)): + self.assertEqual(pserver._slice_vars_and_attrs[idx][0], + pserver2._slice_vars_and_attrs[idx][0]) + + total_numel = six.moves.reduce( + lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape) + self.assertEqual( + total_numel, + six.moves.reduce(lambda x, y: x * y, + pserver._slice_vars_and_attrs[idx][2].shape) + + six.moves.reduce(lambda x, y: x * y, + pserver2._slice_vars_and_attrs[idx][2].shape)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 712fd5849d80b1915ae3b2ae5108bedee8d88a2c..9a3e92e8d775a37e0c24ee1bcc5435628d61bb91 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -12,191 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
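# --- Editorial sketch (illustration only, not part of the patch): the element-count
# bookkeeping behind TestLoadSliceVar above. A (1000, 1000) parameter such as fc_w is
# split across the two pservers as two (500, 1000) slices, and the slice sizes must
# add back up to the full variable.
import six


def numel(shape):
    return six.moves.reduce(lambda x, y: x * y, shape)


assert numel((1000, 1000)) == numel((500, 1000)) + numel((500, 1000))  # 1,000,000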
-import numpy as np -import argparse -import time -import math -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -from paddle.fluid import core +from __future__ import print_function import unittest -from multiprocessing import Process -import os -import signal +from test_dist_base import TestDistBase -IS_SPARSE = True -EMBED_SIZE = 32 -HIDDEN_SIZE = 256 -N = 5 -BATCH_SIZE = 32 -ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy +class TestDistSeResneXt2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True -def get_model(): - def __network__(words): - embed_first = fluid.layers.embedding( - input=words[0], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_second = fluid.layers.embedding( - input=words[1], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_third = fluid.layers.embedding( - input=words[2], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_forth = fluid.layers.embedding( - input=words[3], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') + def test_se_resnext(self): + self.check_with_place("dist_word2vec.py", delta=1e-4) - concat_embed = fluid.layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], axis=1) - hidden1 = fluid.layers.fc(input=concat_embed, - size=HIDDEN_SIZE, - act='sigmoid') - predict_word = fluid.layers.fc(input=hidden1, - size=dict_size, - act='softmax') - cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(cost) - return avg_cost, predict_word - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) +class TestDistSeResneXt2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False - first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') - second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') - third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') - forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') - next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - avg_cost, predict_word = __network__( - [first_word, second_word, third_word, forth_word, next_word]) - - inference_program = paddle.fluid.default_main_program().clone() - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - train_reader = paddle.batch( - paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - test_reader = paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) - - return inference_program, avg_cost, train_reader, test_reader, predict_word - - -def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=trainer_id, - program=main_program, - pservers=pserver_endpoints, - trainers=trainers) - return t - - -def run_pserver(pserver_endpoints, trainers, current_endpoint): - get_model() - t = get_transpiler(0, - fluid.default_main_program(), pserver_endpoints, - trainers) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - exe.run(pserver_prog) - - -class TestDistMnist(unittest.TestCase): - def setUp(self): - 
self._trainers = 1 - self._pservers = 1 - self._ps_endpoints = "127.0.0.1:9123" - - def start_pserver(self, endpoint): - p = Process( - target=run_pserver, - args=(self._ps_endpoints, self._trainers, endpoint)) - p.start() - return p.pid - - def _wait_ps_ready(self, pid): - retry_times = 5 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(1) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def stop_pserver(self, pid): - os.kill(pid, signal.SIGKILL) - - def test_with_place(self): - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - pserver_pid = self.start_pserver(self._ps_endpoints) - self._wait_ps_ready(pserver_pid) - - self.run_trainer(p, 0) - - self.stop_pserver(pserver_pid) - - def run_trainer(self, place, trainer_id): - test_program, avg_cost, train_reader, test_reader, predict = get_model() - t = get_transpiler(trainer_id, - fluid.default_main_program(), self._ps_endpoints, - self._trainers) - - trainer_prog = t.get_trainer_program() - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - use_gpu = True if core.is_compiled_with_cuda() else False - - exec_strategy = ExecutionStrategy() - exec_strategy.use_cuda = use_gpu - train_exe = fluid.ParallelExecutor( - use_cuda=use_gpu, - main_program=trainer_prog, - loss_name=avg_cost.name, - exec_strategy=exec_strategy) - - feed_var_list = [ - var for var in trainer_prog.global_block().vars.itervalues() - if var.is_data - ] - - feeder = fluid.DataFeeder(feed_var_list, place) - for pass_id in xrange(10): - for batch_id, data in enumerate(train_reader()): - avg_loss_np = train_exe.run(feed=feeder.feed(data), - fetch_list=[avg_cost.name]) - loss = np.array(avg_loss_np).mean() - if float(loss) < 5.0: - return - if math.isnan(loss): - assert ("Got Nan loss, training failed") + def test_se_resnext(self): + self.check_with_place("dist_word2vec.py", delta=1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index eaa3435a86462236a99489749abe877648677053..0296bc2af4e0b79478c34b4cceab32b5a8a50f2f 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 0faed94deb4808783027d776e0f4c61da0db457a..d84dab1499a267ca081c2e8ea2856c7c4bb627cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -12,11 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import paddle.fluid as fluid import paddle import unittest import numpy +from paddle.fluid.layers.control_flow import lod_rank_table +from paddle.fluid.layers.control_flow import max_sequence_len +from paddle.fluid.layers.control_flow import lod_tensor_to_array +from paddle.fluid.layers.control_flow import array_to_lod_tensor +from paddle.fluid.layers.control_flow import shrink_memory + class TestDynRNN(unittest.TestCase): def setUp(self): @@ -38,12 +46,11 @@ class TestDynRNN(unittest.TestCase): label = fluid.layers.data(name='label', shape=[1], dtype='float32') - rank_table = fluid.layers.lod_rank_table(x=sent_emb) + rank_table = lod_rank_table(x=sent_emb) - sent_emb_array = fluid.layers.lod_tensor_to_array( - x=sent_emb, table=rank_table) + sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table) - seq_len = fluid.layers.max_sequence_len(rank_table=rank_table) + seq_len = max_sequence_len(rank_table=rank_table) i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) i.stop_gradient = False @@ -66,7 +73,7 @@ class TestDynRNN(unittest.TestCase): mem = fluid.layers.array_read(array=mem_array, i=i) ipt = fluid.layers.array_read(array=sent_emb_array, i=i) - mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) + mem = shrink_memory(x=mem, i=i, table=rank_table) hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') @@ -75,8 +82,7 @@ class TestDynRNN(unittest.TestCase): fluid.layers.array_write(x=hidden, i=i, array=mem_array) fluid.layers.less_than(x=i, y=seq_len, cond=cond) - all_timesteps = fluid.layers.array_to_lod_tensor( - x=out, table=rank_table) + all_timesteps = array_to_lod_tensor(x=out, table=rank_table) last = fluid.layers.sequence_last_step(input=all_timesteps) logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits( @@ -131,7 +137,7 @@ class TestDynRNN(unittest.TestCase): loss_0 = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])[0] - for _ in xrange(100): + for _ in range(100): val = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])[0] diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 0f289af284773caf8515f9cbdd38e0d4481e4e44..9d635f36fe83d041bb57df0759da1481f66bbaa2 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
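# --- Editorial sketch (illustration only, not part of the patch): the Python-2/3
# compatibility idioms applied throughout these test changes.
from __future__ import print_function  # print() behaves the same on 2 and 3

import six

d = {'a': 1, 'b': 2}

for _ in range(3):                          # xrange() -> range()
    pass

first_key = list(d.keys())[0]               # dict.keys()[0] -> list(d.keys())[0]
vals = list(d.values())                     # dict.itervalues() -> d.values()
squares = list(map(lambda v: v * v, vals))  # map() returns an iterator on Python 3

for i in six.moves.xrange(len(squares)):    # six.moves papers over the rest
    print(i, squares[i])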
+from __future__ import print_function + import numpy import random import collections @@ -61,13 +63,13 @@ class BaseRNN(object): self.num_seq = num_seq self.inputs = collections.defaultdict(list) - for _ in xrange(num_seq): + for _ in range(num_seq): seq_len = random.randint(1, max_seq_len - 1) for iname in ins: ishape = ins[iname].get('shape', None) idtype = ins[iname].get('dtype', 'float32') lst = [] - for _ in xrange(seq_len): + for _ in range(seq_len): lst.append(numpy.random.random(size=ishape).astype(idtype)) self.inputs[iname].append(lst) @@ -96,16 +98,16 @@ class BaseRNN(object): for out in self.outputs: retv[out] = [] - for seq_id in xrange(self.num_seq): + for seq_id in range(self.num_seq): for mname in self.mems: self.mems[mname].reset() for out in self.outputs: self.outputs[out].next_sequence() - iname0 = self.inputs.keys()[0] + iname0 = list(self.inputs.keys())[0] seq_len = len(self.inputs[iname0][seq_id]) - for step_id in xrange(seq_len): + for step_id in range(seq_len): xargs = dict() for iname in self.inputs: @@ -138,7 +140,7 @@ class BaseRNN(object): for iname in self.inputs: lod = [] np_flatten = [] - for seq_id in xrange(len(self.inputs[iname])): + for seq_id in range(len(self.inputs[iname])): seq_len = len(self.inputs[iname][seq_id]) lod.append(seq_len) np_flatten.extend(self.inputs[iname][seq_id]) @@ -159,8 +161,8 @@ class BaseRNN(object): " which is not matrix") g = numpy.zeros(shape=p.shape, dtype=p.dtype) - for i in xrange(p.shape[0]): - for j in xrange(p.shape[1]): + for i in range(p.shape[0]): + for j in range(p.shape[1]): o = p[i][j] p[i][j] += delta pos = self._exe_mean_out_() @@ -184,7 +186,7 @@ class BaseRNN(object): if len(item.shape) != 1: raise ValueError("Not support") - for i in xrange(len(item)): + for i in range(len(item)): o = item[i] item[i] += delta pos = self._exe_mean_out_() @@ -198,14 +200,14 @@ class BaseRNN(object): if not return_one_tensor: return grad - for i in xrange(len(grad)): + for i in range(len(grad)): grad[i] = numpy.concatenate(grad[i]) grad = numpy.concatenate(grad) return grad def _exe_mean_out_(self): outs = self.exe() - return numpy.array([o.mean() for o in outs.itervalues()]).mean() + return numpy.array([o.mean() for o in outs.values()]).mean() class SeedFixedTestCase(unittest.TestCase): @@ -274,13 +276,14 @@ class TestSimpleMul(SeedFixedTestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) - out, w_g, i_g = map(numpy.array, - exe.run(feed=py_rnn.to_feed(cpu), - fetch_list=[ - out, self.PARAM_NAME + "@GRAD", - self.DATA_NAME + "@GRAD" - ], - return_numpy=False)) + out, w_g, i_g = list( + map(numpy.array, + exe.run(feed=py_rnn.to_feed(cpu), + fetch_list=[ + out, self.PARAM_NAME + "@GRAD", self.DATA_NAME + + "@GRAD" + ], + return_numpy=False))) out_by_python = py_rnn.exe()[self.OUT_NAME] self.assertTrue(numpy.allclose(out, out_by_python)) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) @@ -351,14 +354,15 @@ class TestSimpleMulWithMemory(SeedFixedTestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) feed = py_rnn.to_feed(cpu) - last_np, w_g, i_g = map(numpy.array, - exe.run(feed=feed, - fetch_list=[ - last, self.PARAM_NAME + "@GRAD", - self.DATA_NAME + "@GRAD" - ], - return_numpy=False)) - last_by_py, = py_rnn.exe().values() + last_np, w_g, i_g = list( + map(numpy.array, + exe.run(feed=feed, + fetch_list=[ + last, self.PARAM_NAME + "@GRAD", self.DATA_NAME + + "@GRAD" + ], + return_numpy=False))) + last_by_py, = list(py_rnn.exe().values()) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) 
self.assertTrue(numpy.allclose(last_np, last_by_py)) diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index 92e718662dfd7998be3ede2994f160059679fa8a..b4359fc69ae18b45774af0d2b20c1540bd99da5c 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle import paddle.fluid.core as core @@ -65,10 +67,10 @@ class TestDyRnnStaticInput(unittest.TestCase): return self._lodtensor_to_ndarray(fetch_outs[0]) def _lodtensor_to_ndarray(self, lod_tensor): - dims = lod_tensor.get_dims() + dims = lod_tensor.shape() ndarray = np.zeros(shape=dims).astype('float32') - for i in xrange(np.product(dims)): - ndarray.ravel()[i] = lod_tensor.get_float_element(i) + for i in range(np.product(dims)): + ndarray.ravel()[i] = lod_tensor._get_float_element(i) return ndarray, lod_tensor.recursive_sequence_lengths() def build_graph(self, only_forward=False): @@ -114,7 +116,7 @@ class TestDyRnnStaticInput(unittest.TestCase): shape=[1], dtype='int64', value=0) step_idx.stop_gradient = True - for i in xrange(self._max_sequence_len): + for i in range(self._max_sequence_len): step_out = fluid.layers.array_read(static_input_out_array, step_idx) step_out.stop_gradient = True @@ -140,27 +142,27 @@ class TestDyRnnStaticInput(unittest.TestCase): static_lod = self.static_input_tensor.recursive_sequence_lengths() static_sliced = [] cur_offset = 0 - for i in xrange(len(static_lod[0])): + for i in range(len(static_lod[0])): static_sliced.append(self.static_input_data[cur_offset:( cur_offset + static_lod[0][i])]) cur_offset += static_lod[0][i] static_seq_len = static_lod[0] static_reordered = [] - for i in xrange(len(x_sorted_indices)): + for i in range(len(x_sorted_indices)): static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist()) static_seq_len_reordered = [ static_seq_len[x_sorted_indices[i]] - for i in xrange(len(x_sorted_indices)) + for i in range(len(x_sorted_indices)) ] static_step_outs = [] static_step_lods = [] - for i in xrange(self._max_sequence_len): + for i in range(self._max_sequence_len): end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1) lod = [] total_len = 0 - for i in xrange(end): + for i in range(end): lod.append(static_seq_len_reordered[i]) total_len += lod[-1] static_step_lods.append([lod]) @@ -174,7 +176,7 @@ class TestDyRnnStaticInput(unittest.TestCase): static_step_outs = self.build_graph(only_forward=True) self.exe.run(framework.default_startup_program()) expected_outs, expected_lods = self.get_expected_static_step_outs() - for i in xrange(self._max_sequence_len): + for i in range(self._max_sequence_len): step_out, lod = self.fetch_value(static_step_outs[i]) self.assertTrue(np.allclose(step_out, expected_outs[i])) self.assertTrue(np.allclose(lod, expected_lods[i])) @@ -185,19 +187,19 @@ class TestDyRnnStaticInput(unittest.TestCase): actual_gradients, actual_lod = self.fetch_value(static_input_grad) - static_input_shape = self.static_input_tensor.get_dims() + static_input_shape = self.static_input_tensor.shape() numeric_gradients = np.zeros(shape=static_input_shape).astype('float32') # calculate numeric gradients tensor_size = np.product(static_input_shape) - for i in xrange(tensor_size): - origin = 
self.static_input_tensor.get_float_element(i) + for i in range(tensor_size): + origin = self.static_input_tensor._get_float_element(i) x_pos = origin + self._delta - self.static_input_tensor.set_float_element(i, x_pos) + self.static_input_tensor._set_float_element(i, x_pos) y_pos = self.fetch_value(loss)[0][0] x_neg = origin - self._delta - self.static_input_tensor.set_float_element(i, x_neg) + self.static_input_tensor._set_float_element(i, x_neg) y_neg = self.fetch_value(loss)[0][0] - self.static_input_tensor.set_float_element(i, origin) + self.static_input_tensor._set_float_element(i, origin) numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2 self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001)) self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py index 816562621b4fc749f3c6b0eca8ee3c5850ef1ba9..4d03523025d357e453848f3016ffee890b5d46ec 100644 --- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py +++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py index bcdbfc8e527d0dc9a95eddaf040f8035207b6c20..d85cc1f856df8eaa73cef318b48a292042488edf 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index fb9a496126f0b6efcad73590c78efe5a47b88cd6..5aec5d8e38aba39e6aba9a8f19637587c2f12544 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index bfe022af6dac711e76678b79005d4cfff90c2a2b..cadaf1df53af0af56afa8c3631b0f5ce390f318c 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
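# --- Editorial sketch (illustration only, not part of the patch): the central-
# difference check used by the numeric-gradient loops above. Each element is nudged
# by +/- delta and the gradient is estimated as (y_pos - y_neg) / delta / 2; the
# quadratic loss below is a stand-in.
import numpy as np


def numeric_gradient(f, x, delta=0.005):
    grad = np.zeros_like(x)
    flat_x, flat_g = x.ravel(), grad.ravel()  # views into x and grad
    for i in range(x.size):
        origin = flat_x[i]
        flat_x[i] = origin + delta
        y_pos = f(x)
        flat_x[i] = origin - delta
        y_neg = f(x)
        flat_x[i] = origin                    # restore, as the tests do
        flat_g[i] = (y_pos - y_neg) / delta / 2
    return grad


x = np.random.random((3, 4)).astype('float32')
analytic = 2.0 * x / x.size                   # gradient of mean(x ** 2)
numeric = numeric_gradient(lambda t: (t ** 2).mean(), x)
assert np.allclose(analytic, numeric, atol=1e-3)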
+ +from __future__ import print_function import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py index c6f45381af8ac64d117eb27325f25763fbf6cae7..9f452ffde74ee18d14f155fb5ed53fee57f12f49 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import numpy as np @@ -26,7 +28,7 @@ class TestElementWiseAddOp(unittest.TestCase): def test_with_place(place): out_grad = np.random.random_sample(self.x.shape).astype(np.float32) x_grad = out_grad - sum_axis = range(0, len(self.x.shape)) + sum_axis = list(range(0, len(self.x.shape))) del sum_axis[self.axis] y_grad = np.sum(out_grad, axis=tuple(sum_axis)) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py index b6cd18a579520f921feed48cc86d8027f6a7bd1e..43c58710ba50c27077942643b84b7642eaf57710 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index 92099724fe65050b62c738f4e6c269a0ca3f4ef1..45c861e2c3df9f14f9886091012d6cca69944454 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 2742bb21d95ed3363e82a70b0172cc787878abd9..775c2253ab3b27708b745b85fc007fcb504d1aed 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py index a3fd18669c556701b2586e88ddbb89ca79549a86..7bf642f03f480b1eeec68298f9d453deb1fa2ac3 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from __future__ import print_function import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index acf652d3fb9743d69b7f7e248ff7a3ee83fc4c50..6cb88a8bb1cad7a58ca175cfc14298c959e3bad6 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import numpy as np from op_test import OpTest @@ -20,8 +22,8 @@ class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32") } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py index bb7c0f88f6027807394e15aa6803da2ddc22f4e2..798ed53cddade22e986cae65109b6c6ac7a291b6 100644 --- a/python/paddle/fluid/tests/unittests/test_exception.py +++ b/python/paddle/fluid/tests/unittests/test_exception.py @@ -12,19 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + +import paddle.compat as cpt import paddle.fluid.core as core import unittest class TestException(unittest.TestCase): def test_exception(self): - ex = None + exception = None try: core.__unittest_throw_exception__() except core.EnforceNotMet as ex: - self.assertIn("test exception", ex.message) + self.assertIn("test exception", cpt.get_exception_message(ex)) + exception = ex - self.assertIsNotNone(ex) + self.assertIsNotNone(exception) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py index e1272c1d6dd7131b55ecf33fa0de0fc78a3ac5a7..b1f89eca6e58aec41b5863f4c885d5c6231a72f4 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py +++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py index a91e3aef5a18a681f3baf405da2beebb8c85360c..67a8d8f0721c2c75b432d68d64be8fc1035ffc74 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8629bcf0f2e3c37aefdbf79b203176a43e0c3a7e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class TestExtractRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Variable + feature_len = 12 + rows = [0, 4, 4, 7] + np_array = np.ones((len(rows), feature_len)).astype("float32") + + in_x = scope.var('X').get_selected_rows() + in_x.set_height(len(rows)) + in_x.set_rows(rows) + in_x_tensor = in_x.get_tensor() + in_x_tensor.set(np_array, place) + + # create Out Variable + out_tensor = scope.var('Out').get_tensor() + + # create and run lookup_table operator + extract_rows_op = Operator("extract_rows", X='X', Out='Out') + extract_rows_op.run(scope, place) + + # get result from Out + result_array = np.array(out_tensor) + result_array = [ele[0] for ele in result_array] + assert result_array == rows + + def test_concat_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 281068e945e76a42635868d19573498f79fde1f3..1bb4662e8d83ac0c34b209e4e7a605869fdb59d5 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -12,48 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import math from op_test import OpTest -def quantize_max_abs(x, num_bits): - range = math.pow(2, num_bits) - 1 +def quantize_max_abs(x, max_range): scale = np.max(np.abs(x).flatten()) - y = np.round(x / scale * range) + y = np.round(x / scale * max_range) return y, scale -def dequantize_max_abs(x, num_bits, scale): - range = math.pow(2, num_bits) - 1 - y = (scale / range) * x +def dequantize_max_abs(x, scale, max_range): + y = (scale / max_range) * x return y class TestFakeDequantizeMaxAbsOp(OpTest): def set_args(self): self.num_bits = 8 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_dequantize_max_abs" - x = np.random.randn(31, 65).astype("float32") - yq, scale = quantize_max_abs(x, self.num_bits) - print 'scale ', scale - ydq = dequantize_max_abs(yq, self.num_bits, scale) + x = np.random.randn(31, 65).astype(self.data_type) + yq, scale = quantize_max_abs(x, self.max_range) + ydq = dequantize_max_abs(yq, scale, self.max_range) - self.inputs = {'X': yq} - self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)} + self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)} + self.attrs = {'max_range': self.max_range} self.outputs = {'Out': ydq} def test_check_output(self): self.check_output() -class TestFakeDequantizeMaxAbsOp5Bits(OpTest): +class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp): + def set_args(self): + self.num_bits = 8 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float64" + + +class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp): def set_args(self): self.num_bits = 5 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float32" if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py new file mode 100644 index 0000000000000000000000000000000000000000..820ad4af88e9dc49cbe57ac182e1ba0402725f3d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -0,0 +1,66 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
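# --- Editorial sketch (illustration only, not part of the patch): the abs-max
# quantization round trip behind the fake_dequantize test above and the fake_quantize
# test that follows. With num_bits = 8 the integer range is 2**(8 - 1) - 1 = 127 and
# the scale is the tensor's absolute maximum.
import numpy as np

num_bits = 8
max_range = (1 << (num_bits - 1)) - 1      # 127

x = np.random.randn(4, 5).astype('float32')
scale = np.max(np.abs(x))                  # abs-max over the whole tensor

q = np.round(x / scale * max_range)        # quantize: integers in [-127, 127]
x_dq = q * (scale / max_range)             # dequantize back to float

# Lossy, but the error is bounded by half a quantization step.
assert np.all(np.abs(x - x_dq) <= 0.5 * scale / max_range + 1e-6)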
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestFakeQuantizeOp(OpTest): + def setUp(self): + self.op_type = "fake_quantize_abs_max" + self.attrs = {'bit_length': 8} + self.inputs = {'X': np.random.random((124, 240)).astype("float32"), } + scale = np.max(np.abs(self.inputs['X'])).astype("float32") + self.outputs = { + 'Out': np.round(self.inputs['X'] / scale * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)), + 'OutScale': np.array(scale).astype("float32"), + } + + def test_check_output(self): + self.check_output() + + +class TestFakeQuantizeOp(OpTest): + def setUp(self): + self.op_type = "fake_quantize_range_abs_max" + self.attrs = { + 'bit_length': int(5), + 'window_size': int(1), + 'is_test': False + } + self.inputs = { + 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'Iter': np.zeros(1).astype("int64"), + 'InScale': np.zeros(1).astype("float32") + } + scale = np.max(np.abs(self.inputs['X'])).astype("float32") + out_scales = np.zeros(self.attrs['window_size']).astype("float32") + out_scales[0] = scale + self.outputs = { + 'Out': np.round(self.inputs['X'] / scale * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)), + 'OutScale': scale, + 'OutScales': out_scales, + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py index 3f547f3c484bf034a87823a75d946ef130a5cb70..45951a34d6f61a242cb2dc004d6801a6c1c9dd92 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -22,6 +24,7 @@ def fully_connected_naive(input, weights, bias_data=None): w_h, w_c = weights.shape x_data = np.reshape(input, [in_n, in_c * in_h * in_w]) + # this transpose should be implemented at C code w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w))) result = None @@ -43,15 +46,11 @@ class TestFCMKLDNNOp(OpTest): def setUp(self): self.op_type = "fc" self.use_mkldnn = True - self.with_bias = True self.matrix = MatrixGenerate(1, 10, 15, 3, 3) self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights} - self.attrs = { - 'use_mkldnn': self.use_mkldnn, - 'with_bias': self.with_bias - } + self.attrs = {'use_mkldnn': self.use_mkldnn, } self.outputs = { 'Out': fully_connected_naive(self.matrix.input, self.matrix.weights) @@ -85,13 +84,11 @@ class TestFCMKLDNNOp3(TestFCMKLDNNOp): class TestFCMKLDNNOp4(TestFCMKLDNNOp): def init_op_type(self): - self.with_bias = False self.matrix = MatrixGenerate(2, 32, 48, 2, 2) class TestFCMKLDNNOp4(TestFCMKLDNNOp): def init_op_type(self): - self.with_bias = False self.matrix = MatrixGenerate(2, 32, 1000, 6, 6) diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ff417ad2f16b83cd42a0603375c14450195e7fc0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +def fc_refer(matrix, with_bias): + in_n, in_c, in_h, in_w = matrix.input.shape + w_i, w_o = matrix.weights.shape + + x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w]) + w_data = np.reshape(matrix.weights, [w_i, w_o]) + b_data = np.reshape(matrix.bias, [1, w_o]) + result = None + + if with_bias: + result = np.dot(x_data, w_data) + b_data + else: + result = np.dot(x_data, w_data) + + return result + + +class MatrixGenerate: + def __init__(self, mb, ic, oc, h, w): + self.input = np.random.random((mb, ic, h, w)).astype("float32") + self.weights = np.random.random((ic * h * w, oc)).astype("float32") + self.bias = np.random.random((1, oc)).astype("float32") + + +class TestFCOp(OpTest): + def setUp(self): + self.op_type = "fc" + self.matrix = MatrixGenerate(1, 10, 15, 3, 3) + + self.with_bias = True + if self.with_bias: + self.inputs = { + 'Input': self.matrix.input, + 'W': self.matrix.weights, + 'Bias': self.matrix.bias + } + else: + self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights} + + self.attrs = {'use_mkldnn': False} + + self.outputs = {'Out': fc_refer(self.matrix, self.with_bias)} + + def test_check_output(self): + self.check_output() + + +class TestFCOpNoBias(TestFCOp): + def init_shapes(self, mb, ic, oc, h, w): + self.with_bias = False + self.matrix = MatrixGenerate(mb, ic, oc, h, w) + + +class TestFCOpWithBias(TestFCOp): + def init_shapes(self, mb, ic, oc, h, w): + self.with_bias = True + self.matrix = MatrixGenerate(mb, ic, oc, h, w) + + +class TestFCOp1(TestFCOpNoBias): + def init_op_type(self): + self.init_shapes(2, 8, 10, 1, 1) + + +class TestFCOp2(TestFCOpNoBias): + def init_op_type(self): + self.init_shapes(4, 5, 6, 2, 2) + + +class TestFCOp4(TestFCOpNoBias): + def init_op_type(self): + self.init_shapes(1, 32, 64, 3, 3) + + +class TestFCOpWithBias1(TestFCOpWithBias): + def init_op_type(self): + self.init_shapes(3, 8, 10, 2, 1) + + +class TestFCOpWithBias2(TestFCOpWithBias): + def init_op_type(self): + self.init_shapes(4, 5, 6, 2, 2) + + +class TestFCOpWithBias3(TestFCOpWithBias): + def init_op_type(self): + self.init_shapes(1, 64, 32, 3, 3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py index 8b9da843115409c65055927d317867d1290c8f0e..b823d397e9530362f5fee417278e36477d65f6f5 100644 --- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py +++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
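fc_refer in the new test_fc_op.py above flattens each NCHW sample into a row vector before the matmul, so the weight matrix is laid out as [C*H*W, OC]. A shape-checked sketch of that reference computation (the concrete sizes here are illustrative):

    import numpy as np

    def fc_refer(x_nchw, w, b=None):
        n = x_nchw.shape[0]
        x2d = x_nchw.reshape(n, -1)                 # [N, C*H*W]
        out = x2d.dot(w)                            # w is [C*H*W, OC]
        return out + b if b is not None else out    # b broadcasts as [1, OC]

    x = np.random.random((2, 3, 4, 4)).astype("float32")    # N=2, C=3, H=W=4
    w = np.random.random((3 * 4 * 4, 5)).astype("float32")  # OC=5
    b = np.random.random((1, 5)).astype("float32")
    assert fc_refer(x, w, b).shape == (2, 5)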
+from __future__ import print_function + import paddle.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py index 46c3bbb6712c6276e48dd9328d7741a447f28b91..de339d821b1329662469c26eacd234b74a102e13 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_var.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid import paddle.fluid.layers as layers import op_test @@ -26,7 +28,7 @@ class TestFetchVar(op_test.OpTest): layers.assign(input=val, output=x) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_main_program(), feed={}, fetch_list=[]) - fetched_x = fluid.fetch_var("x") + fetched_x = fluid.executor._fetch_var("x") self.assertTrue( numpy.array_equal(fetched_x, val), "fetch_x=%s val=%s" % (fetched_x, val)) diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py index 0c75cf33f5f208d11081a6802910c25553b8c4ec..fdc8a118e56f4473da5ed60169daebec14c7c33c 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 5e2ddb218af8fcf4f686260296b57519ec7486b9..fd59c5bb7cff5dd33fae284ba3efe04e667ed75a 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator + class TestFillConstantOp1(OpTest): def setUp(self): @@ -45,5 +50,31 @@ class TestFillConstantOp2(OpTest): self.check_output() +class TestFillConstantOpWithSelectedRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + # create Out Variable + out = scope.var('Out').get_selected_rows() + + # create and run fill_constant_op operator + fill_constant_op = Operator( + "fill_constant", shape=[123, 92], value=3.8, Out='Out') + fill_constant_op.run(scope, place) + + # get result from Out + result_array = np.array(out.get_tensor()) + full_array = np.full((123, 92), 3.8, 'float32') + + self.assertTrue(np.array_equal(result_array, full_array)) + + def test_fill_constant_with_selected_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_op.py b/python/paddle/fluid/tests/unittests/test_fill_op.py index 762d29199e2127415ed7daabca63edcdbae3344f..b734ee05b3f2291d7a79f1550946bf6546ada6e0 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py index c9b3e4ba138f425fd2991bf637d2e32be3eb5168..eec73d0beb39c49f535a03532e536092001c8445 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py new file mode 100644 index 0000000000000000000000000000000000000000..effa2a148eef8b0047b12c676803abb2871e8118 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
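The flatten2 cases in the next hunk hard-code new_shape by hand; the underlying rule folds the leading `axis` dimensions into the first output dimension and the remaining dimensions into the second. A small sketch that reproduces the shapes used below:

    import operator
    from functools import reduce

    def flatten2_shape(in_shape, axis=1):
        # dims [0, axis) collapse into the first output dim,
        # dims [axis, rank) into the second one
        prod = lambda dims: reduce(operator.mul, dims, 1)
        return (prod(in_shape[:axis]), prod(in_shape[axis:]))

    assert flatten2_shape((3, 2, 2, 5), axis=1) == (3, 20)
    assert flatten2_shape((3, 2, 2, 3), axis=0) == (1, 36)
    assert flatten2_shape((3, 2, 3, 2, 4, 4), axis=4) == (36, 16)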
+ +from __future__ import print_function + +import unittest +import numpy as np + +from op_test import OpTest + + +class TestFlattenOp(OpTest): + def setUp(self): + self.op_type = "flatten2" + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def test_check_output(self): + self.check_output(no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 2, 5) + self.axis = 1 + self.new_shape = (3, 20) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlattenOp(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 3) + self.axis = 0 + self.new_shape = (1, 36) + + +class TestFlattenOpWithDefaultAxis(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 3) + self.new_shape = (3, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py index c906c74afe66b05e2ca0e1122677e2dc738351b8..72f43e56ccbe04f56cfd5a655fb57c58369039bb 100644 --- a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py +++ b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from paddle.fluid.framework import Program diff --git a/python/paddle/fluid/tests/unittests/test_ftrl_op.py b/python/paddle/fluid/tests/unittests/test_ftrl_op.py index 5f7581391afb2bd57cb25695f3c0d4db8573c80c..a6390b054f06184831c289fe9556216ae213be7c 100644 --- a/python/paddle/fluid/tests/unittests/test_ftrl_op.py +++ b/python/paddle/fluid/tests/unittests/test_ftrl_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py new file mode 100644 index 0000000000000000000000000000000000000000..4a213c29113e5e23af2caf7fbcb807be3d0166d2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py @@ -0,0 +1,341 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
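The fused_elemwise_activation tests below build expected outputs from callbacks such as add_relu_func and relu_add_func: the second functor in functor_list runs first, its result is exposed as IntermediateOut, and the first functor combines it with the other operand. A sketch covering just the two relu/add pairings (the scale and mul variants follow the same pattern):

    import numpy as np

    def fused_reference(x, y, functor_list):
        # functor_list[1] runs first and produces IntermediateOut,
        # functor_list[0] then combines it with the remaining operand
        outer, inner = functor_list
        if inner == "relu":                      # ["elementwise_add", "relu"]
            intermediate = np.maximum(y, 0)
            out = x + intermediate
        elif outer == "relu":                    # ["relu", "elementwise_add"]
            intermediate = x + y
            out = np.maximum(intermediate, 0)
        else:
            raise ValueError("only the relu/add pairings are sketched")
        return out, intermediate

    x = np.random.uniform(0.1, 1, (13, 17)).astype("float32")
    y = np.random.uniform(0.1, 1, (13, 17)).astype("float32")
    out, inter = fused_reference(x, y, ["relu", "elementwise_add"])
    assert np.allclose(out, np.maximum(x + y, 0))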
+ +from __future__ import print_function +import unittest +import numpy as np +from functools import partial +import paddle.fluid.core as core +from op_test import OpTest + +# TestFusedElementwiseActivationOp +# TestFusedElementwiseActivationOp_scalar +# TestFusedElementwiseActivationOp_scalar2 +# TestFusedElementwiseActivationOp_Vector +# TestFusedElementwiseActivationOp_broadcast_0 +# TestFusedElementwiseActivationOp_broadcast_1 +# TestFusedElementwiseActivationOp_broadcast_2 +# TestFusedElementwiseActivationOp_broadcast_3 +# TestFusedElementwiseActivationOp_broadcast_4 +# TestFusedElementwiseActivationOp_rowwise_add_0 +# TestFusedElementwiseActivationOp_rowwise_add_1 +# TestFusedElementwiseActivationOp_channelwise_add + + +def create_test_class(test_case, callback, attrs): + class TestFusedElementwiseActivationOp_base(OpTest): + def setUp(self): + self.op_type = "fused_elemwise_activation" + self.dtype = np.float32 + self.axis = -1 + + self.init_input() + self.init_output() + self.init_attr() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + if self.attrs["keep_intermediate_value"]: + self.outputs = { + 'Out': self.out, + "IntermediateOut": self.intermediate_out + } + else: + self.outputs = {'Out': self.out} + + def init_input(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.axis = -1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y) + + def init_attr(self): + self.attrs = {'axis': self.axis, } + for key in attrs.keys(): + self.attrs[key] = attrs[key] + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + if self.attrs["keep_intermediate_value"]: + self.check_grad( + ['X', 'Y'], ['Out', 'IntermediateOut'], + max_relative_error=0.005, + sum_outputs=['Out']) + else: + self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005) + + def test_check_grad_ingore_x(self): + if self.attrs["keep_intermediate_value"]: + self.check_grad( + ['Y'], ['Out', 'IntermediateOut'], + max_relative_error=0.005, + no_grad_set=set("X"), + sum_outputs=['Out']) + else: + self.check_grad( + ['Y'], ['Out'], + max_relative_error=0.005, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + if self.attrs["keep_intermediate_value"]: + self.check_grad( + ['X'], ['Out', 'IntermediateOut'], + max_relative_error=0.005, + no_grad_set=set("Y"), + sum_outputs=['Out']) + else: + self.check_grad( + ['X'], ['Out'], + max_relative_error=0.005, + no_grad_set=set("Y")) + + class TestFusedElementwiseActivationOp_scalar( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + + class TestFusedElementwiseActivationOp_scalar2( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + + class TestFusedElementwiseActivationOp_Vector( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.random((32, )).astype(self.dtype) + self.y = np.random.random((32, )).astype(self.dtype) + + class TestFusedElementwiseActivationOp_broadcast_0( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = 
np.random.rand(2).astype(self.dtype) + self.axis = 0 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(2, 1, 1)) + + class TestFusedElementwiseActivationOp_broadcast_1( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 3, 1)) + + class TestFusedElementwiseActivationOp_broadcast_2( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 1, 4)) + + class TestFusedElementwiseActivationOp_broadcast_3( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 3, 4, 1)) + + class TestFusedElementwiseActivationOp_broadcast_4( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2, 1).astype(self.dtype) + self.axis = 0 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(2, 1, 1, 1)) + + class TestFusedElementwiseActivationOp_rowwise_add_0( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 3, 4)) + + class TestFusedElementwiseActivationOp_rowwise_add_1( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 1)) + + class TestFusedElementwiseActivationOp_channelwise_add( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + + TestFusedElementwiseActivationOp_base.__name__ = test_case + "_base" + TestFusedElementwiseActivationOp_scalar.__name__ = test_case + "_scalar" + TestFusedElementwiseActivationOp_scalar2.__name__ = test_case + "_scalar2" + TestFusedElementwiseActivationOp_Vector.__name__ = test_case + "_Vector" + TestFusedElementwiseActivationOp_broadcast_0.__name__ = test_case + "_broadcast_0" + TestFusedElementwiseActivationOp_broadcast_1.__name__ = test_case + "_broadcast_1" + TestFusedElementwiseActivationOp_broadcast_2.__name__ = test_case + "_broadcast_2" + TestFusedElementwiseActivationOp_broadcast_3.__name__ = test_case + "_broadcast_3" + TestFusedElementwiseActivationOp_broadcast_4.__name__ = test_case + "_broadcast_4" + TestFusedElementwiseActivationOp_rowwise_add_0.__name__ = test_case + "_rowwise_add_0" + 
TestFusedElementwiseActivationOp_rowwise_add_1.__name__ = test_case + "_rowwise_add_1" + TestFusedElementwiseActivationOp_channelwise_add.__name__ = test_case + "_channelwise_add" + + globals()[test_case + "_base"] = TestFusedElementwiseActivationOp_base + globals()[test_case + "_scalar"] = TestFusedElementwiseActivationOp_scalar + globals()[test_case + "_scalar2"] = TestFusedElementwiseActivationOp_scalar2 + globals()[test_case + "_Vector"] = TestFusedElementwiseActivationOp_Vector + globals()[test_case + + "_broadcast_0"] = TestFusedElementwiseActivationOp_broadcast_0 + globals()[test_case + + "_broadcast_1"] = TestFusedElementwiseActivationOp_broadcast_1 + globals()[test_case + + "_broadcast_2"] = TestFusedElementwiseActivationOp_broadcast_2 + globals()[test_case + + "_broadcast_3"] = TestFusedElementwiseActivationOp_broadcast_3 + globals()[test_case + + "_broadcast_4"] = TestFusedElementwiseActivationOp_broadcast_4 + globals()[test_case + + "_rowwise_add_0"] = TestFusedElementwiseActivationOp_rowwise_add_0 + globals()[test_case + + "_rowwise_add_1"] = TestFusedElementwiseActivationOp_rowwise_add_1 + globals( + )[test_case + + "_channelwise_add"] = TestFusedElementwiseActivationOp_channelwise_add + + +def scale_add_func(x, y, x_bcast, y_bcast, scale, mode=0): + if mode == 0: + return x, y, (x_bcast + y_bcast), (x_bcast + y_bcast) * scale + else: + return y, x, (x_bcast + y_bcast), (x_bcast + y_bcast) * scale + + +def add_scale_func(x, y, x_bcast, y_bcast, scale, mode=0): + if mode == 0: + return x, y, y * scale, x_bcast + y_bcast * scale + else: + return y, x, x * scale, y_bcast + x_bcast * scale + + +def add_relu_func(x, y, x_bcast, y_bcast, mode=0): + # Copy from test_activation_op.py + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. 
+ # we should avoid this + if mode == 0: + y[np.abs(y) < 0.005] = 0.02 + y_bcast[np.abs(y_bcast) < 0.005] = 0.02 + return x, y, np.maximum(y, 0), x_bcast + np.maximum(y_bcast, 0) + else: + x[np.abs(x) < 0.005] = 0.02 + x_bcast[np.abs(x_bcast) < 0.005] = 0.02 + return y, x, np.maximum(x, 0), y_bcast + np.maximum(x_bcast, 0) + + +def relu_add_func(x, y, x_bcast, y_bcast, mode=0): + intermediate_out = x_bcast + y_bcast + out = np.maximum(intermediate_out, 0) + out[np.abs(out) < 0.005] = 0.02 + if mode == 0: + return x, y, intermediate_out, out + else: + return y, x, intermediate_out, out + + +def mul_scale_func(x, y, x_bcast, y_bcast, scale, mode=0): + if mode == 0: + return x, y, y * scale, x_bcast * (y_bcast * scale) + else: + return y, x, x * scale, y_bcast * (x_bcast * scale) + + +scale = 0.1 +scale_add_func = partial(scale_add_func, scale=scale) +add_scale_func = partial(add_scale_func, scale=scale) +mul_scale_func = partial(mul_scale_func, scale=scale) + +for mode in {0, 1}: + scale_add_func = partial(scale_add_func, mode=mode) + add_scale_func = partial(add_scale_func, mode=mode) + mul_scale_func = partial(mul_scale_func, mode=mode) + relu_add_func = partial(relu_add_func, mode=mode) + add_relu_func = partial(add_relu_func, mode=mode) + + for recomputation in {True, False}: + for keep_intermediate_value in {True, False}: + suffix = ("_keep_intermediate_value" if keep_intermediate_value else "") \ + + ("_recomputation" if recomputation else "") \ + + ("_mode_"+ str(mode)) + create_test_class('scale_add' + suffix, scale_add_func, { + 'scale': scale, + 'functor_list': ["scale", "elementwise_add"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('add_scale' + suffix, add_scale_func, { + 'scale': scale, + 'functor_list': ["elementwise_add", "scale"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('add_relu' + suffix, add_relu_func, { + 'functor_list': ["elementwise_add", "relu"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('relu_add' + suffix, relu_add_func, { + 'functor_list': ["relu", "elementwise_add"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('mul_scale' + suffix, mul_scale_func, { + 'scale': scale, + 'functor_list': ["elementwise_mul", "scale"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py new file mode 100644 index 0000000000000000000000000000000000000000..36ebc8fb6ea9efdcd1807f5c8917ab1428b3381e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
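create_test_class above stamps out one TestCase subclass per input layout, renames each class, and publishes it through globals() so the unittest loader can discover it; the partial(...) calls then bind scale and mode into the reference callbacks. A toy reduction of that registration pattern (all names here are hypothetical):

    import unittest
    import numpy as np

    def create_case(name, callback):
        # wrap the reference callback in a TestCase subclass and register
        # it under a unique name so the unittest loader discovers it
        class _Case(unittest.TestCase):
            def test_callback(self):
                x = np.ones(4, dtype="float32")
                np.testing.assert_allclose(callback(x), x + 1.0)

        _Case.__name__ = name
        globals()[name] = _Case

    create_case("TestAddOne", lambda v: v + 1.0)

    if __name__ == "__main__":
        unittest.main()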
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +from op_test import OpTest +from test_gru_op import gru +from test_fusion_lstm_op import fc, ACTIVATION + + +def fusion_gru( + x, # T x M + lod, # 1 x N + h0, # N x D + wx, # M x 3D + wh, # D x 3D + bias, # 1 x 3D + is_reverse, + act_state, + act_gate): + return gru(fc(x, wx, bias), + lod, + h0, + wh, + np.zeros( + (1, wh.shape[1]), dtype='float32'), + is_reverse, + act_state, + act_gate) + + +class TestFusionGRUOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.M = 3 + self.D = 5 + self.is_reverse = False + self.with_h0 = True + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.set_confs() + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + x = np.random.rand(T, self.M).astype('float32') + wx = np.random.rand(self.M, 3 * self.D).astype('float32') + wh = np.random.rand(self.D, 3 * self.D).astype('float32') + bias = np.random.rand( + 1, 3 * self.D).astype('float32') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float32') + h0 = np.random.rand( + N, self.D).astype('float32') if self.with_h0 else np.zeros( + (N, self.D), dtype='float32') + + _, _, _, hidden = fusion_gru( + x, self.lod, h0, wx, wh, bias, self.is_reverse, + ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) + + self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + self.outputs = {'Hidden': (hidden, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse + } + + def test_check_output(self): + for use_seq in {True, False}: + self.attrs['use_seq'] = use_seq + self.check_output() + + +class TestFusionGRUOpNoInitial(TestFusionGRUOp): + def set_confs(self): + self.with_h0 = False + + +class TestFusionGRUOpNoBias(TestFusionGRUOp): + def set_confs(self): + self.with_bias = False + + +class TestFusionGRUOpReverse(TestFusionGRUOp): + def set_confs(self): + self.is_reverse = True + + +class TestFusionGRUOpMD1(TestFusionGRUOp): + def set_confs(self): + self.M = 36 + self.D = 8 + + +class TestFusionGRUOpMD2(TestFusionGRUOp): + def set_confs(self): + self.M = 8 + self.D = 8 + + +class TestFusionGRUOpBS1(TestFusionGRUOp): + def set_confs(self): + self.lod = [[3]] + self.D = 16 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..de0c86f96db958eebd7e74346bec244f0c804ed9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py @@ -0,0 +1,194 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
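Both fusion tests fold the bias of the leading FC into the recurrent bias (the fusion_lstm setup below literally does b[0, 0:4*D] += bx), which works because the two bias adds are associative. A quick numeric check of that folding, with shapes borrowed from the LSTM test:

    import numpy as np

    # folding the FC bias into the gate bias:
    # (x.dot(wx) + bx) + w_b  ==  x.dot(wx) + (bx + w_b)
    M, D = 8, 16
    x = np.random.normal(size=(5, M)).astype("float32")
    wx = np.random.normal(size=(M, 4 * D)).astype("float32")
    bx = np.random.normal(size=(1, 4 * D)).astype("float32")   # FC bias
    wb = np.random.normal(size=(1, 4 * D)).astype("float32")   # gate bias

    unfused = (x.dot(wx) + bx) + wb
    fused = x.dot(wx) + (bx + wb)
    np.testing.assert_allclose(unfused, fused, rtol=1e-5, atol=1e-5)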
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_lstm_op import lstm, ACTIVATION + + +def fc(x, w, b): + return np.dot(x, w) + b + + +def fusion_lstm( + x, # T x M + lod, # 1 x N + wx=None, # M x 4D + bx=None, # 1 x 4D + h0=None, # N x D + c0=None, # N x D + w_h=None, # D x 4D + w_b=None, # 1 x 4D + w_c=None, # 1 x 3D + is_reverse=False, + act_gate=None, + act_cell=None, + act_cand=None): + return lstm( + fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate, + act_cell, act_cand) + + +class TestFusionLSTMOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'fusion_lstm' + self.lod = [[2, 3, 5, 4]] + self.M = 8 + self.D = 16 + self.has_initial_state = False + self.use_peepholes = False + self.is_reverse = False + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + x = np.random.normal(size=(T, self.M)).astype('float32') + if self.has_initial_state: + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') + else: + h0 = np.zeros((bs, self.D)).astype('float32') + c0 = np.zeros((bs, self.D)).astype('float32') + + wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32') + + if self.use_peepholes: + b = np.random.normal(size=(1, 7 * self.D)).astype('float32') + else: + b = np.random.normal(size=(1, 4 * self.D)).astype('float32') + w_b = np.copy(b[:, 0:4 * self.D]) + w_c = b[:, 4 * self.D:] if self.use_peepholes else None + + # this is the weight of fc + wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32') + # this is the bias of fc + # and it should be manually added into the bias of this fusion LSTM + bx = np.random.normal(size=(1, 4 * self.D)).astype('float32') + b[0, 0:4 * self.D] += bx[0, :] + h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c, + self.is_reverse, ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x, self.lod), + 'WeightX': wx, + 'WeightH': wh, + 'Bias': b + } + + if self.has_initial_state: + self.inputs['H0'] = h0 + self.inputs['C0'] = c0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + self.attrs = { + 'use_peepholes': self.use_peepholes, + 'is_reverse': self.is_reverse, + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + for use_seq in {True, False}: + self.attrs['use_seq'] = use_seq + self.check_output() + + +class TestFusionLSTMOpInit(TestFusionLSTMOp): + def set_conf(self): + self.has_initial_state = True + + +class TestFusionLSTMOpReverse(TestFusionLSTMOp): + def set_conf(self): + self.is_reverse = True + + +class TestFusionLSTMOpInitReverse(TestFusionLSTMOp): + def set_conf(self): + self.has_initial_state = True + self.is_reverse = True + + +class TestFusionLSTMOpMD1(TestFusionLSTMOp): + def set_conf(self): + self.M = 36 + self.D = 8 + + +class TestFusionLSTMOpMD2(TestFusionLSTMOp): + def set_conf(self): + self.M = 8 + self.D = 8 + + +class TestFusionLSTMOpMD3(TestFusionLSTMOp): + def set_conf(self): + self.M = 15 + self.D = 3 + + +class TestFusionLSTMOpBS1(TestFusionLSTMOp): + def set_conf(self): + self.lod = [[3]] + self.D = 16 + + +class TestFusionLSTMOpPeepholes(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + + +class 
TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.has_initial_state = True + + +class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.is_reverse = True + + +class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.has_initial_state = True + self.is_reverse = True + + +class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.lod = [[2]] + self.D = 8 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py new file mode 100644 index 0000000000000000000000000000000000000000..aeee3a9999a94b4979fc3793150101352e50be85 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py @@ -0,0 +1,139 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION + + +def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act): + + T = sum(lod[0]) + N = len(lod[0]) + num_inputs = len(xs) + D = w.shape[1] + + expanded_inputs = [xs[0]] + for i in range(num_inputs - 1): + x = xs[i + 1] + assert x.shape[0] == N + expanded = np.repeat(x, lod[0], axis=0) + assert expanded.shape[0] == T + assert expanded.shape[1] == x.shape[1] + expanded_inputs.append(expanded) + + fc_input = np.concatenate(expanded_inputs, axis=1) + assert fc_input.shape[0] == T + assert fc_input.shape[1] == w.shape[0] + fc_out = fc(fc_input, w, b) + fc_out = fc_act(fc_out) + assert fc_out.shape[0] == T + assert fc_out.shape[1] == D + return fc_out + + +class TestFusionSeqExpandConcatFCOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'fusion_seqexpand_concat_fc' + self.lod = [[3, 5, 8, 2]] + self.inputs_M = [15, 10, 10] + self.D = 20 + self.with_bias = True + self.fc_act = 'relu' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + num_inputs = len(self.inputs_M) + + x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32') + xs = [x0] + for i in range(num_inputs - 1): + xi = np.random.normal(size=(bs, + self.inputs_M[i + 1])).astype('float32') + xs.append(xi) + + # fc weight and bias + w = np.random.normal(size=(sum(self.inputs_M), + self.D)).astype('float32') + b = np.random.normal(size=( + 1, self.D)).astype('float32') if self.with_bias else np.zeros( + (1, self.D)).astype('float32') + + out = fusion_seqexpand_concat_fc(xs, self.lod, w, b, + ACTIVATION[self.fc_act]) + + self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w} + normal_lod = [[1] * bs] + for i in range(num_inputs - 1): + self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod))) + + if 
self.with_bias: + self.inputs['FCBias'] = b + + self.outputs = {'Out': (out, self.lod)} + self.attrs = {'fc_activation': self.fc_act} + + def test_check_output(self): + self.check_output() + + +class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.with_bias = False + + +class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.fc_act = 'identity' + + +class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.inputs_M = [3, 4, 2, 1, 5] + self.D = 8 + + +class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[5, 6]] + self.inputs_M = [1, 1] + + +class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[1]] + self.inputs_M = [3, 4, 2] + + +class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[1]] + self.inputs_M = [3, 4] + + +class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[5]] + self.inputs_M = [6, 3] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 4ae90864806204197c52bbbdc5516f141afd4613..bd5785aa55af241fe42a1ae2c550dbdb980f42e2 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py index 1398166a74e714e0e902532166cde5d94ccae5f6..9a0631fa26a3e93c5c2115fd03a37de3fac46ce5 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py index 3ae877a60818744f852d3af9a02ffebf5e2affc8..9777ec390656d3f6166bf9f5de7bbad8b6bd786d 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from test_gaussian_random_op import TestGaussianRandomOp diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 8481500fd78f0ccf34f09c66bec27e195b9aada3..496aa4111056591efce14549011d66f9ae49713a 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
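The fusion_seqexpand_concat_fc reference above expands every batch-level input to sequence length with np.repeat over the LoD, concatenates along the feature axis, and only then applies the FC. A sketch of the expand-and-concat step with illustrative sizes:

    import numpy as np

    def seqexpand_concat(xs, seq_lens):
        # xs[0] is already sequence-level (sum(seq_lens) rows); every other
        # input has one row per sequence and is repeated to sequence length
        # before everything is concatenated along the feature axis
        expanded = [xs[0]]
        for x in xs[1:]:
            expanded.append(np.repeat(x, seq_lens, axis=0))
        return np.concatenate(expanded, axis=1)

    seq_lens = [3, 5, 8, 2]                    # lod[0], T = 18 rows in total
    x0 = np.random.rand(sum(seq_lens), 15).astype("float32")
    x1 = np.random.rand(len(seq_lens), 10).astype("float32")
    assert seqexpand_concat([x0, x1], seq_lens).shape == (18, 25)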
+from __future__ import print_function + import unittest import numpy diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py new file mode 100644 index 0000000000000000000000000000000000000000..2d5cd3b24bff52d82353ccf3fd2ecb69166c66c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -0,0 +1,336 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +import paddle.fluid as fluid +from op_test import OpTest + + +def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes, + im_info, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, + bbox_reg_weights, class_nums): + rois = [] + labels_int32 = [] + bbox_targets = [] + bbox_inside_weights = [] + bbox_outside_weights = [] + lod = [] + assert len(rpn_rois) == len( + im_info), 'batch size of rpn_rois and ground_truth is not matched' + + for im_i in range(len(im_info)): + frcn_blobs = _sample_rois( + rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], + im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh, + bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums) + + lod.append(frcn_blobs['rois'].shape[0]) + + rois.append(frcn_blobs['rois']) + labels_int32.append(frcn_blobs['labels_int32']) + bbox_targets.append(frcn_blobs['bbox_targets']) + bbox_inside_weights.append(frcn_blobs['bbox_inside_weights']) + bbox_outside_weights.append(frcn_blobs['bbox_outside_weights']) + + return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod + + +def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, bbox_reg_weights, class_nums): + rois_per_image = int(batch_size_per_im) + fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) + + # Roidb + im_scale = im_info[2] + inv_im_scale = 1. 
/ im_scale + rpn_rois = rpn_rois * inv_im_scale + + boxes = np.vstack([gt_boxes, rpn_rois]) + gt_overlaps = np.zeros((boxes.shape[0], class_nums)) + box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32) + if len(gt_boxes) > 0: + proposal_to_gt_overlaps = _bbox_overlaps(boxes, gt_boxes) + + overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) + overlaps_max = proposal_to_gt_overlaps.max(axis=1) + # Boxes which with non-zero overlap with gt boxes + overlapped_boxes_ind = np.where(overlaps_max > 0)[0] + overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ + overlapped_boxes_ind]] + gt_overlaps[overlapped_boxes_ind, + overlapped_boxes_gt_classes] = overlaps_max[ + overlapped_boxes_ind] + box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ + overlapped_boxes_ind] + + crowd_ind = np.where(is_crowd)[0] + gt_overlaps[crowd_ind] = -1 + + max_overlaps = gt_overlaps.max(axis=1) + max_classes = gt_overlaps.argmax(axis=1) + + # Foreground + fg_inds = np.where(max_overlaps >= fg_thresh)[0] + fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) + # Sample foreground if there are too many + # if fg_inds.shape[0] > fg_rois_per_this_image: + # fg_inds = np.random.choice( + # fg_inds, size=fg_rois_per_this_image, replace=False) + fg_inds = fg_inds[:fg_rois_per_this_image] + + # Background + bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= + bg_thresh_lo))[0] + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, + bg_inds.shape[0]) + # Sample background if there are too many + # if bg_inds.shape[0] > bg_rois_per_this_image: + # bg_inds = np.random.choice( + # bg_inds, size=bg_rois_per_this_image, replace=False) + bg_inds = bg_inds[:bg_rois_per_this_image] + + keep_inds = np.append(fg_inds, bg_inds) + sampled_labels = max_classes[keep_inds] + sampled_labels[fg_rois_per_this_image:] = 0 + sampled_boxes = boxes[keep_inds] + sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]] + sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0] + + bbox_label_targets = _compute_targets(sampled_boxes, sampled_gts, + sampled_labels, bbox_reg_weights) + bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_label_targets, + class_nums) + bbox_outside_weights = np.array( + bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype) + + # Scale rois + sampled_rois = sampled_boxes * im_scale + + # Faster RCNN blobs + frcn_blobs = dict( + rois=sampled_rois, + labels_int32=sampled_labels, + bbox_targets=bbox_targets, + bbox_inside_weights=bbox_inside_weights, + bbox_outside_weights=bbox_outside_weights) + return frcn_blobs + + +def _bbox_overlaps(roi_boxes, gt_boxes): + w1 = np.maximum(roi_boxes[:, 2] - roi_boxes[:, 0] + 1, 0) + h1 = np.maximum(roi_boxes[:, 3] - roi_boxes[:, 1] + 1, 0) + w2 = np.maximum(gt_boxes[:, 2] - gt_boxes[:, 0] + 1, 0) + h2 = np.maximum(gt_boxes[:, 3] - gt_boxes[:, 1] + 1, 0) + area1 = w1 * h1 + area2 = w2 * h2 + + overlaps = np.zeros((roi_boxes.shape[0], gt_boxes.shape[0])) + for ind1 in range(roi_boxes.shape[0]): + for ind2 in range(gt_boxes.shape[0]): + inter_x1 = np.maximum(roi_boxes[ind1, 0], gt_boxes[ind2, 0]) + inter_y1 = np.maximum(roi_boxes[ind1, 1], gt_boxes[ind2, 1]) + inter_x2 = np.minimum(roi_boxes[ind1, 2], gt_boxes[ind2, 2]) + inter_y2 = np.minimum(roi_boxes[ind1, 3], gt_boxes[ind2, 3]) + inter_w = np.maximum(inter_x2 - inter_x1 + 1, 0) + inter_h = np.maximum(inter_y2 - inter_y1 + 1, 0) + inter_area = inter_w * inter_h + iou = inter_area / (area1[ind1] + 
area2[ind2] - inter_area) + overlaps[ind1, ind2] = iou + return overlaps + + +def _compute_targets(roi_boxes, gt_boxes, labels, bbox_reg_weights): + assert roi_boxes.shape[0] == gt_boxes.shape[0] + assert roi_boxes.shape[1] == 4 + assert gt_boxes.shape[1] == 4 + + targets = np.zeros(roi_boxes.shape) + bbox_reg_weights = np.asarray(bbox_reg_weights) + targets = _box_to_delta( + ex_boxes=roi_boxes, gt_boxes=gt_boxes, weights=bbox_reg_weights) + + return np.hstack([labels[:, np.newaxis], targets]).astype( + np.float32, copy=False) + + +def _box_to_delta(ex_boxes, gt_boxes, weights): + ex_w = ex_boxes[:, 2] - ex_boxes[:, 0] + 1 + ex_h = ex_boxes[:, 3] - ex_boxes[:, 1] + 1 + ex_ctr_x = ex_boxes[:, 0] + 0.5 * ex_w + ex_ctr_y = ex_boxes[:, 1] + 0.5 * ex_h + + gt_w = gt_boxes[:, 2] - gt_boxes[:, 0] + 1 + gt_h = gt_boxes[:, 3] - gt_boxes[:, 1] + 1 + gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_w + gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_h + + dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] + dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] + dw = (np.log(gt_w / ex_w)) / weights[2] + dh = (np.log(gt_h / ex_h)) / weights[3] + + targets = np.vstack([dx, dy, dw, dh]).transpose() + return targets + + +def _expand_bbox_targets(bbox_targets_input, class_nums): + class_labels = bbox_targets_input[:, 0] + fg_inds = np.where(class_labels > 0)[0] + + bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums)) + bbox_inside_weights = np.zeros(bbox_targets.shape) + for ind in fg_inds: + class_label = int(class_labels[ind]) + start_ind = class_label * 4 + end_ind = class_label * 4 + 4 + bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:] + bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0) + + return bbox_targets, bbox_inside_weights + + +class TestGenerateProposalLabelsOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), + 'GtClasses': (self.gt_classes[0], self.gts_lod), + 'IsCrowd': (self.is_crowd[0], self.gts_lod), + 'GtBoxes': (self.gt_boxes[0], self.gts_lod), + 'ImInfo': self.im_info + } + self.attrs = { + 'batch_size_per_im': self.batch_size_per_im, + 'fg_fraction': self.fg_fraction, + 'fg_thresh': self.fg_thresh, + 'bg_thresh_hi': self.bg_thresh_hi, + 'bg_thresh_lo': self.bg_thresh_lo, + 'bbox_reg_weights': self.bbox_reg_weights, + 'class_nums': self.class_nums, + 'use_random': False + } + self.outputs = { + 'Rois': (self.rois, [self.lod]), + 'LabelsInt32': (self.labels_int32, [self.lod]), + 'BboxTargets': (self.bbox_targets, [self.lod]), + 'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]), + 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]), + } + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = 'generate_proposal_labels' + self.set_data() + + def init_test_params(self): + self.batch_size_per_im = 512 + self.fg_fraction = 0.25 + self.fg_thresh = 0.5 + self.bg_thresh_hi = 0.5 + self.bg_thresh_lo = 0.0 + self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2] + self.class_nums = 81 + + def init_test_input(self): + np.random.seed(0) + gt_nums = 6 # Keep same with batch_size_per_im for unittest + proposal_nums = 2000 #self.batch_size_per_im - gt_nums + images_shape = [[64, 64]] + self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + self.im_info[i, 0] = images_shape[i][0] + self.im_info[i, 1] = images_shape[i][1] + self.im_info[i, 2] = 0.8 #scale + + 
self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape, + proposal_nums) + ground_truth, self.gts_lod = _generate_groundtruth( + images_shape, self.class_nums, gt_nums) + self.gt_classes = [gt['gt_classes'] for gt in ground_truth] + self.gt_boxes = [gt['boxes'] for gt in ground_truth] + self.is_crowd = [gt['is_crowd'] for gt in ground_truth] + + def init_test_output(self): + self.rois, self.labels_int32, self.bbox_targets, \ + self.bbox_inside_weights, self.bbox_outside_weights, \ + self.lod = generate_proposal_labels_in_python( + self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info, + self.batch_size_per_im, self.fg_fraction, + self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, + self.bbox_reg_weights, self.class_nums + ) + self.rois = np.vstack(self.rois) + self.labels_int32 = np.hstack(self.labels_int32) + self.labels_int32 = self.labels_int32[:, np.newaxis] + self.bbox_targets = np.vstack(self.bbox_targets) + self.bbox_inside_weights = np.vstack(self.bbox_inside_weights) + self.bbox_outside_weights = np.vstack(self.bbox_outside_weights) + + +def _generate_proposals(images_shape, proposal_nums): + rpn_rois = [] + rpn_rois_lod = [] + num_proposals = 0 + for i, image_shape in enumerate(images_shape): + proposals = _generate_boxes(image_shape, proposal_nums) + rpn_rois.append(proposals) + num_proposals = len(proposals) + rpn_rois_lod.append(num_proposals) + return rpn_rois, [rpn_rois_lod] + + +def _generate_groundtruth(images_shape, class_nums, gt_nums): + ground_truth = [] + gts_lod = [] + num_gts = 0 + for i, image_shape in enumerate(images_shape): + # Avoid background + gt_classes = np.random.randint( + low=1, high=class_nums, size=gt_nums).astype(np.int32) + gt_boxes = _generate_boxes(image_shape, gt_nums) + is_crowd = np.zeros((gt_nums), dtype=np.int32) + is_crowd[0] = 1 + ground_truth.append( + dict( + gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd)) + num_gts += len(gt_classes) + gts_lod.append(num_gts) + return ground_truth, [gts_lod] + + +def _generate_boxes(image_size, box_nums): + width = image_size[0] + height = image_size[1] + xywh = np.random.rand(box_nums, 4) + xy1 = xywh[:, [0, 1]] * image_size + wh = xywh[:, [2, 3]] * (image_size - xy1) + xy2 = xy1 + wh + boxes = np.hstack([xy1, xy2]) + boxes[:, [0, 2]] = np.minimum(width - 1., np.maximum(0., boxes[:, [0, 2]])) + boxes[:, [1, 3]] = np.minimum(height - 1., np.maximum(0., boxes[:, [1, 3]])) + return boxes.astype(np.float32) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py new file mode 100644 index 0000000000000000000000000000000000000000..86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -0,0 +1,329 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
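_sample_rois above encodes sampled boxes with _box_to_delta, and the generate_proposals test that follows decodes deltas back into boxes in box_coder; the two transforms are inverses under the same +1 width/height convention. A round-trip sketch (weights play the role of bbox_reg_weights / variances; the boxes are illustrative):

    import numpy as np

    def box_to_delta(ex, gt, weights):
        # encode gt boxes against example boxes: centre offsets are
        # normalised by the example size, sizes become log ratios,
        # everything is divided by the regression weights
        ex_w, ex_h = ex[:, 2] - ex[:, 0] + 1, ex[:, 3] - ex[:, 1] + 1
        gt_w, gt_h = gt[:, 2] - gt[:, 0] + 1, gt[:, 3] - gt[:, 1] + 1
        ex_cx, ex_cy = ex[:, 0] + 0.5 * ex_w, ex[:, 1] + 0.5 * ex_h
        gt_cx, gt_cy = gt[:, 0] + 0.5 * gt_w, gt[:, 1] + 0.5 * gt_h
        dx = (gt_cx - ex_cx) / ex_w / weights[0]
        dy = (gt_cy - ex_cy) / ex_h / weights[1]
        dw = np.log(gt_w / ex_w) / weights[2]
        dh = np.log(gt_h / ex_h) / weights[3]
        return np.vstack([dx, dy, dw, dh]).T

    def delta_to_box(ex, deltas, weights):
        # the decode direction: recover centre/size, then corners
        ex_w, ex_h = ex[:, 2] - ex[:, 0] + 1, ex[:, 3] - ex[:, 1] + 1
        ex_cx, ex_cy = ex[:, 0] + 0.5 * ex_w, ex[:, 1] + 0.5 * ex_h
        cx = deltas[:, 0] * weights[0] * ex_w + ex_cx
        cy = deltas[:, 1] * weights[1] * ex_h + ex_cy
        w = np.exp(deltas[:, 2] * weights[2]) * ex_w
        h = np.exp(deltas[:, 3] * weights[3]) * ex_h
        return np.vstack([cx - 0.5 * w, cy - 0.5 * h,
                          cx + 0.5 * w - 1, cy + 0.5 * h - 1]).T

    weights = np.array([0.1, 0.1, 0.2, 0.2])
    ex = np.array([[10., 10., 50., 60.]])
    gt = np.array([[12., 15., 48., 70.]])
    deltas = box_to_delta(ex, gt, weights)
    np.testing.assert_allclose(delta_to_box(ex, deltas, weights), gt, atol=1e-6)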
+ +import unittest +import numpy as np +import sys +import math +import paddle.fluid as fluid +from op_test import OpTest +from test_multiclass_nms_op import nms +from test_anchor_generator_op import anchor_generator_in_python +import copy + + +def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors, + variances, pre_nms_topN, post_nms_topN, + nms_thresh, min_size, eta): + all_anchors = anchors.reshape(-1, 4) + rois = np.empty((0, 5), dtype=np.float32) + roi_probs = np.empty((0, 1), dtype=np.float32) + + rpn_rois = [] + rpn_roi_probs = [] + lod = [] + num_images = scores.shape[0] + for img_idx in range(num_images): + img_i_boxes, img_i_probs = proposal_for_one_image( + im_info[img_idx, :], all_anchors, variances, + bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :], + pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta) + lod.append(img_i_probs.shape[0]) + rpn_rois.append(img_i_boxes) + rpn_roi_probs.append(img_i_probs) + + return rpn_rois, rpn_roi_probs, lod + + +def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, + pre_nms_topN, post_nms_topN, nms_thresh, min_size, + eta): + # Transpose and reshape predicted bbox transformations to get them + # into the same order as the anchors: + # - bbox deltas will be (4 * A, H, W) format from conv output + # - transpose to (H, W, 4 * A) + # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A) + # in slowest to fastest order to match the enumerated anchors + bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4) + all_anchors = all_anchors.reshape(-1, 4) + variances = variances.reshape(-1, 4) + # Same story for the scores: + # - scores are (A, H, W) format from conv output + # - transpose to (H, W, A) + # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A) + # to match the order of anchors and bbox_deltas + scores = scores.transpose((1, 2, 0)).reshape(-1, 1) + + # sort all (proposal, score) pairs by score from highest to lowest + # take top pre_nms_topN (e.g. 6000) + if pre_nms_topN <= 0 or pre_nms_topN >= len(scores): + order = np.argsort(-scores.squeeze()) + else: + # Avoid sorting possibly large arrays; + # First partition to get top K unsorted + # and then sort just thoes + inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN] + order = np.argsort(-scores[inds].squeeze()) + order = inds[order] + scores = scores[order, :] + bbox_deltas = bbox_deltas[order, :] + all_anchors = all_anchors[order, :] + proposals = box_coder(all_anchors, bbox_deltas, variances) + # clip proposals to image (may result in proposals with zero area + # that will be removed in the next step) + proposals = clip_tiled_boxes(proposals, im_info[:2]) + # remove predicted boxes with height or width < min_size + keep = filter_boxes(proposals, min_size, im_info) + proposals = proposals[keep, :] + scores = scores[keep, :] + + # apply loose nms (e.g. threshold = 0.7) + # take post_nms_topN (e.g. 
1000) + # return the top proposals + if nms_thresh > 0: + keep = nms(boxes=proposals, + scores=scores, + nms_threshold=nms_thresh, + eta=eta) + if post_nms_topN > 0 and post_nms_topN < len(keep): + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep, :] + + return proposals, scores + + +def box_coder(all_anchors, bbox_deltas, variances): + """ + Decode proposals by anchors and bbox_deltas from RPN + """ + #proposals: xmin, ymin, xmax, ymax + proposals = np.zeros_like(bbox_deltas, dtype=np.float32) + + #anchor_loc: width, height, center_x, center_y + anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) + + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1 + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1 + anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0] + anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] + + #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height + pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) + if variances is not None: + for i in range(bbox_deltas.shape[0]): + pred_bbox[i, 0] = variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[ + i, 0] + anchor_loc[i, 2] + pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[ + i, 1] + anchor_loc[i, 3] + pred_bbox[i, 2] = math.exp( + min(variances[i, 2] * bbox_deltas[i, 2], math.log( + 1000 / 16.0))) * anchor_loc[i, 0] + pred_bbox[i, 3] = math.exp( + min(variances[i, 3] * bbox_deltas[i, 3], math.log( + 1000 / 16.0))) * anchor_loc[i, 1] + else: + for i in range(bbox_deltas.shape[0]): + pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[ + i, 2] + pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[ + i, 3] + pred_bbox[i, 2] = math.exp( + min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i, + 0] + pred_bbox[i, 3] = math.exp( + min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i, + 1] + + proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 + proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1 + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1 + + return proposals + + +def clip_tiled_boxes(boxes, im_shape): + """Clip boxes to image boundaries. im_shape is [height, width] and boxes + has shape (N, 4 * num_tiled_boxes).""" + assert boxes.shape[1] % 4 == 0, \ + 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( + boxes.shape[1] + ) + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +def filter_boxes(boxes, min_size, im_info): + """Only keep boxes with both sides >= min_size and center within the image. + """ + # Scale min_size to match image scale + im_scale = im_info[2] + min_size = max(min_size, 1.0) + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 + hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 + x_ctr = boxes[:, 0] + ws / 2. + y_ctr = boxes[:, 1] + hs / 2. 
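+ # Keep only boxes whose width and height (rescaled to the original image by im_info[2]) are at least min_size and whose center lies inside the image.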
+ keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) & + (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0] + return keep + + +def iou(box_a, box_b): + """ + Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + + area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1) + area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, nms_threshold, eta=1.0): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. + """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou(boxes[idx], boxes[kept_idx]) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + +class TestGenerateProposalsOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'Scores': self.scores, + 'BboxDeltas': self.bbox_deltas, + 'ImInfo': self.im_info.astype(np.float32), + 'Anchors': self.anchors, + 'Variances': self.variances + } + + self.attrs = { + 'pre_nms_topN': self.pre_nms_topN, + 'post_nms_topN': self.post_nms_topN, + 'nms_thresh': self.nms_thresh, + 'min_size': self.min_size, + 'eta': self.eta + } + + print("lod = ", self.lod) + self.outputs = { + 'RpnRois': (self.rpn_rois[0], [self.lod]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) + } + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "generate_proposals" + self.set_data() + + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 3.0 + self.eta = 0.8 + + def init_test_input(self): + batch_size = 1 + input_channels = 20 + layer_h = 16 + layer_w = 16 + input_feat = np.random.random( + (batch_size, input_channels, layer_h, layer_w)).astype('float32') + self.anchors, self.variances = anchor_generator_in_python( + input_feat=input_feat, + anchor_sizes=[16., 32.], + aspect_ratios=[0.5, 1.0], + variances=[1.0, 1.0, 1.0, 1.0], + stride=[16.0, 16.0], + 
offset=0.5) + self.im_info = np.array([[64., 64., 8.]]) #im_height, im_width, scale + num_anchors = self.anchors.shape[2] + self.scores = np.random.random( + (batch_size, num_anchors, layer_h, layer_w)).astype('float32') + self.bbox_deltas = np.random.random( + (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32') + + def init_test_output(self): + self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python( + self.scores, self.bbox_deltas, self.im_info, self.anchors, + self.variances, self.pre_nms_topN, self.post_nms_topN, + self.nms_thresh, self.min_size, self.eta) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py index 6dab1e22f0c50ab011d6b8e8944097600cf3fecc..441666a97b16a320692d6a15363f61156e52242b 100644 --- a/python/paddle/fluid/tests/unittests/test_get_places_op.py +++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places import decorators import unittest @@ -20,7 +23,7 @@ import unittest class TestGetPlaces(unittest.TestCase): @decorators.prog_scope() def test_get_places(self): - places = fluid.layers.get_places() + places = get_places() cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) exe.run(fluid.default_main_program()) diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 8fbf1560859aa295fc40b36129d0f0d07d55dd9f..9f6f03f9cfe3c505a7b1227e2b20db3c3c84c745 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -12,33 +12,34 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import math +import functools from op_test import OpTest -from test_lstm_op import identity, sigmoid, tanh, relu - - -class TestGRUOp(OpTest): - lod = [[2, 4, 3]] - batch_size = sum(lod[0]) - frame_size = 5 - activate = { - 'identity': identity, - 'sigmoid': sigmoid, - 'tanh': tanh, - 'relu': relu - } - - @staticmethod - def seq_to_batch(lod, is_reverse): +from test_lstm_op import ACTIVATION + + +def gru( + input, # T x 3D + lod, # 1 x N + h0, # N x D + weight, # D x 3D + bias, # 1 x 3D + is_reverse, + act_state, + act_gate): + def _seq_to_batch(lod, is_reverse): idx_in_seq_list = [] seq_lens = lod[0] seq_starts = [0] for i in range(len(seq_lens)): seq_starts.append(seq_starts[-1] + seq_lens[i]) sorted_seqs = sorted( - range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x]) + list(range(len(seq_lens))), + key=functools.cmp_to_key(lambda x, y: seq_lens[y] - seq_lens[x])) num_batch = seq_lens[sorted_seqs[0]] for batch_idx in range(num_batch): idx_in_seq = [] @@ -52,120 +53,125 @@ class TestGRUOp(OpTest): idx_in_seq_list.append(idx_in_seq) return idx_in_seq_list, sorted_seqs - def gru_step(self, x, h_p, w, b): - batch_size = x.shape[0] - frame_size = w.shape[0] - g = x + np.tile(b, (batch_size, 1)) - w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape( - (frame_size, frame_size * 2)) - u_r = self.activate[self.attrs['gate_activation']](np.dot( - h_p, w_u_r) + g[:, :frame_size * 2]) - u = u_r[:, :frame_size] - r = u_r[:, frame_size:frame_size * 2] + def _step(x, h_p, w, b, act_state, act_gate): + T = x.shape[0] + D = w.shape[0] + g = x + np.tile(b, (T, 1)) + w_u_r = w.flatten()[:D * D * 2].reshape((D, D * 2)) + u_r = act_gate(np.dot(h_p, w_u_r) + g[:, :D * 2]) + u = u_r[:, :D] + r = u_r[:, D:D * 2] r_h_p = r * h_p - w_c = w.flatten()[frame_size * frame_size * 2:].reshape( - (frame_size, frame_size)) - c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) + - g[:, frame_size * 2:]) + w_c = w.flatten()[D * D * 2:].reshape((D, D)) + c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:]) g = np.hstack((u_r, c)) h = u * c + (1 - u) * h_p return g, r_h_p, h - def gru(self): - input, lod = self.inputs['Input'] - w = self.inputs['Weight'] - b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros( - (1, self.frame_size * 3)) - batch_gate = self.outputs['BatchGate'] - batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev'] - batch_hidden = self.outputs['BatchHidden'] - hidden = self.outputs['Hidden'] - idx_in_seq_list = self.idx_in_seq_list - h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key( - 'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size)) - num_batch = len(idx_in_seq_list) - end_idx = 0 - for batch_idx in range(num_batch): - x = input[idx_in_seq_list[batch_idx]] - g, r_h_p, h = self.gru_step(x, h_p, w, b) - if batch_idx < (num_batch - 1): - h_p = h[:len(idx_in_seq_list[batch_idx + 1])] - start_idx = end_idx - end_idx = start_idx + len(idx_in_seq_list[batch_idx]) - batch_gate[start_idx:end_idx] = g - batch_reset_hidden_prev[start_idx:end_idx] = r_h_p - batch_hidden[start_idx:end_idx] = h - hidden[idx_in_seq_list[batch_idx]] = h - return batch_gate, batch_reset_hidden_prev, hidden - - def set_data(self): - lod = self.lod - self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch( - lod, self.is_reverse) - batch_size = self.batch_size - frame_size = self.frame_size - input = np.random.rand(batch_size, frame_size * 3).astype('float64') - h0 = 
np.random.rand(len(self.idx_in_seq_list[0]), - frame_size).astype('float64') - weight = np.random.rand(frame_size, frame_size * 3).astype('float64') - bias = np.random.rand(1, frame_size * 3).astype('float64') - - self.inputs = { - 'Input': (input, lod), - 'H0': h0, - 'Weight': weight, - 'Bias': bias - } + T = sum(lod[0]) + N = len(lod[0]) + D = weight.shape[0] + batch_gate = np.zeros((T, 3 * D), dtype='float64') + batch_reset_hidden_prev = np.zeros((T, D), dtype='float64') + batch_hidden = np.zeros((T, D), dtype='float64') + hidden = np.zeros((T, D), dtype='float64') + + idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse) + h_p = h0[sorted_seqs] + max_seq_len = len(idx_in_seq_list) + assert len(idx_in_seq_list[0]) == N + end_idx = 0 + for batch_idx in range(max_seq_len): + x = input[idx_in_seq_list[batch_idx]] + g, r_h_p, h = _step(x, h_p, weight, bias, act_state, act_gate) + if batch_idx < (max_seq_len - 1): + h_p = h[:len(idx_in_seq_list[batch_idx + 1])] + start_idx = end_idx + end_idx = start_idx + len(idx_in_seq_list[batch_idx]) + batch_gate[start_idx:end_idx] = g + batch_reset_hidden_prev[start_idx:end_idx] = r_h_p + batch_hidden[start_idx:end_idx] = h + hidden[idx_in_seq_list[batch_idx]] = h + return batch_gate, batch_reset_hidden_prev, batch_hidden, hidden - self.outputs = { - 'BatchGate': np.zeros( - (batch_size, frame_size * 3), dtype='float64'), - 'BatchResetHiddenPrev': np.zeros( - (batch_size, frame_size), dtype='float64'), - 'BatchHidden': np.zeros( - (batch_size, frame_size), dtype='float64'), - 'Hidden': np.zeros( - (batch_size, frame_size), dtype='float64') - } +class TestGRUOp(OpTest): def set_confs(self): - self.is_reverse = False - self.attrs = { - 'activation': 'tanh', - 'gate_activation': 'sigmoid', - 'is_reverse': self.is_reverse - } + pass def setUp(self): self.op_type = "gru" + self.lod = [[2, 4, 3]] + self.D = 5 + self.is_reverse = False + self.with_h0 = True + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' self.set_confs() - self.set_data() - self.gru() + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + input = np.random.rand(T, 3 * self.D).astype('float64') + weight = np.random.rand(self.D, 3 * self.D).astype('float64') + bias = np.random.rand( + 1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float64') + h0 = np.random.rand( + N, self.D).astype('float64') if self.with_h0 else np.zeros( + (N, self.D), dtype='float64') + + batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru( + input, self.lod, h0, weight, bias, self.is_reverse, + ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) + self.inputs = {'Input': (input, self.lod), 'Weight': weight} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + self.outputs = { + 'Hidden': (hidden, self.lod), + 'BatchGate': batch_gate, + 'BatchResetHiddenPrev': batch_reset_hidden_prev, + 'BatchHidden': batch_hidden, + } + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse + } def test_check_output(self): - self.check_output() + self.check_output(atol=1e-8) def test_check_grad(self): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) class TestGRUOpNoInitial(TestGRUOp): - def set_data(self): - super(TestGRUOpNoInitial, self).set_data() - self.inputs.pop('H0') + def set_confs(self): + self.with_h0 = False def test_check_grad(self): self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden']) +class 
TestGRUOpNoBias(TestGRUOp): + def set_confs(self): + self.with_bias = False + + def test_check_grad(self): + self.check_grad(['Input', 'H0', 'Weight'], ['Hidden']) + + class TestGRUOpReverse(TestGRUOp): def set_confs(self): self.is_reverse = True - self.attrs = { - 'activation': 'tanh', - 'gate_activation': 'sigmoid', - 'is_reverse': self.is_reverse - } if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py index c56b1eefd3a3dfe1478bd0526fa32077edcac9ba..b5a66fdf086f2abc0c9a8af663241b9eda739407 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import math import unittest import numpy as np @@ -76,7 +78,7 @@ class TestGRUUnitOp(OpTest): x = self.inputs['Input'] h_p = self.inputs['HiddenPrev'] w = self.inputs['Weight'] - b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros( + b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros( (1, frame_size * 3)) g = x + np.tile(b, (batch_size, 1)) w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape( diff --git a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py index 70586c6be3da415fbccf4114615e6f7e08de0f0f..1eb441e2c52905c2b60104de5e04037714b34648 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py new file mode 100644 index 0000000000000000000000000000000000000000..6948ae30023a75d4735db1c78466e89e28640c9e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
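+# The reference hsigmoid below mirrors hierarchical sigmoid: for each sample it walks the binary code of its label (via CodeTable), accumulates bias plus dot(w, x) along that path, clips the pre-activations to [-40, 40], and combines them into the per-sample loss.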
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +from op_test import OpTest + +np.random.seed(100) + + +def find_latest_set(num): + return 1 + int(math.floor(math.log(num, 2))) + + +class CodeTable(object): + def __init__(self, num_classes, code): + self.c = num_classes + code + + def cal_index(self, bit): + return (self.c >> (bit + 1)) - 1 + + def get_length(self): + return find_latest_set(self.c) - 1 + + def cal_bit(self, bit): + return self.c & (1 << bit) + + +def hsigmoid(x, w, label, bias, num_classes): + batch_size = x.shape[0] + code_length = find_latest_set(num_classes - 1) + code_table = [0 for _ in range(code_length)] + pre_output = np.zeros((batch_size, code_length)) + pre_sum = np.zeros((batch_size, 1)) + out = np.zeros((batch_size, 1)).astype("float32") + for i in range(batch_size): + code_table = CodeTable(num_classes, label[i]) + length = code_table.get_length() + for j in range(length): + idx = code_table.cal_index(j) + pre_output[i][j] += bias[0][idx] + for i in range(batch_size): + code_table = CodeTable(num_classes, label[i]) + length = code_table.get_length() + for j in range(length): + idx = code_table.cal_index(j) + pre_output[i][j] += np.dot(w[idx], x[i]) + # clip[-40.0, 40.0] + pre_output = np.clip(pre_output, -40.0, 40.0) + # out(i, 0) = \sum_j bit(i, j) * preout(i, j) + for i in range(batch_size): + code_table = CodeTable(num_classes, label[i]) + length = code_table.get_length() + sum = 0.0 + for j in range(length): + if code_table.cal_bit(j): + sum += pre_output[i][j] + out[i] = -1.0 * sum + # soft relu + pre_output = np.log(1 + np.exp(pre_output)) + pre_sum = pre_output.sum(1).reshape((batch_size, 1)) + out += pre_sum + return pre_output, out + + +class TestHSigmoidOp(OpTest): + def setUp(self): + self.op_type = "hierarchical_sigmoid" + num_classes = 6 + feature_size = 8 + batch_size = 4 + x = np.random.random((batch_size, feature_size)).astype("float32") + w = np.random.random((num_classes - 1, feature_size)).astype("float32") + label = np.random.randint(0, num_classes, (batch_size, 1)) + bias = np.random.random((1, num_classes - 1)).astype("float32") + self.attrs = {'num_classes': num_classes} + self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} + pre_output, out = hsigmoid(x, w, label, bias, num_classes) + self.outputs = {'PreOut': pre_output, 'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py index a8d0a77625598e9d929a993db46ba95b0e07527a..0055ef0052fe126b268cf7a17a8307224cced99a 100644 --- a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py index 4946475f11a4fc0ccaffeec6821d3976ea7c6560..833e46483c2532e283fd672dc56cb93941f5b4ba 100644 --- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py +++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py @@ -11,28 +11,55 @@ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #See the License for the specific language governing permissions and #limitations under the License. + +from __future__ import print_function import unittest import numpy as np from op_test import OpTest -def get_output_shape(attrs, in_shape): +def get_output_shape(attrs, in_shape, img_real_size): + batchsize = in_shape[0] img_height = in_shape[2] img_width = in_shape[3] + paddings = np.array(attrs['paddings']).astype("int32") + kernels = np.array(attrs['kernels']).astype("int32") + strides = np.array(attrs['strides']).astype("int32") + output_height = np.zeros((1, batchsize)).astype("int32") + output_width = np.zeros((1, batchsize)).astype("int32") + if len(img_real_size): + out_stride = np.array(attrs['out_stride']).astype("int32") + imgreal_h = 0 + imgreal_w = 0 + for index in range(batchsize): + if img_real_size[index, 0] % out_stride[0] == 0: + imgreal_h = img_real_size[index, 0] / out_stride[0] + else: + imgreal_h = img_real_size[index, 0] / out_stride[0] + 1 + if img_real_size[index, 1] % out_stride[1] == 0: + imgreal_w = img_real_size[index, 1] / out_stride[1] + else: + imgreal_w = img_real_size[index, 1] / out_stride[1] + 1 + output_height[0,index] = \ + 1 + \ + (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ + strides[0] - paddings = attrs['paddings'] - kernels = attrs['kernels'] - strides = attrs['strides'] - - output_height = \ - 1 + \ - (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ - strides[0] + output_width[0,index] = \ + 1 + \ + (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ + strides[1] + else: + for index in range(batchsize): + output_height[0,index] = \ + 1 + \ + (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ + strides[0] - output_width = \ - 1 + \ - (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ - strides[1] + output_width[0,index] = \ + 1 + \ + (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ + strides[1] return output_height, output_width @@ -75,22 +102,25 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -def Im2Sequence(inputs, attrs): - output_height, output_width = get_output_shape(attrs, inputs.shape) +def Im2Sequence(inputs, img_real_size, attrs): + output_height, output_width = get_output_shape(attrs, inputs.shape, + img_real_size) img_channels = inputs.shape[1] batch_size = inputs.shape[0] - out = np.zeros([ - batch_size, output_height, output_width, img_channels, - attrs['kernels'][0], attrs['kernels'][1] - ]).astype("float32") - - for i in range(len(inputs)): - im2col(attrs, inputs[i], out[i]) - - out = out.reshape([ - batch_size * output_height * output_width, - img_channels * attrs['kernels'][0] * attrs['kernels'][1] - ]) + out = [] + for index in range(batch_size): + tmp = np.zeros([ + output_height[0, index], output_width[0, index], img_channels, + attrs['kernels'][0], attrs['kernels'][1] + ]).astype("float32") + out.append(tmp) + for 
index in range(len(inputs)): + im2col(attrs, inputs[index], out[index]) + out[index] = out[index].reshape([ + output_height[0, index] * output_width[0, index], + img_channels * attrs['kernels'][0] * attrs['kernels'][1] + ]) + out = np.concatenate(out, axis=0) return out @@ -103,7 +133,7 @@ class TestBlockExpandOp(OpTest): self.attrs = { 'kernels': [2, 2], 'strides': [1, 1], - 'paddings': [1, 1, 1, 1] + 'paddings': [1, 1, 1, 1], } def setUp(self): @@ -113,7 +143,8 @@ class TestBlockExpandOp(OpTest): self.batch_size, self.img_channels, self.img_height, self.img_width ]).astype("float32") - out = Im2Sequence(x, self.attrs) + real_size = np.array([]).astype("float32") + out = Im2Sequence(x, real_size, self.attrs) self.inputs = {'X': x} self.outputs = {'Out': out} @@ -133,20 +164,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp): self.attrs = { 'kernels': [2, 1], 'strides': [2, 1], - 'paddings': [2, 1, 2, 1] + 'paddings': [2, 1, 2, 1], } class TestBlockExpandOpCase3(TestBlockExpandOp): def config(self): - self.batch_size = 3 + self.batch_size = 2 self.img_channels = 1 self.img_height = 4 self.img_width = 5 self.attrs = { 'kernels': [2, 1], 'strides': [2, 1], - 'paddings': [2, 0, 2, 0] + 'paddings': [2, 0, 2, 0], } @@ -159,9 +190,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp): self.attrs = { 'kernels': [2, 2], 'strides': [1, 1], - 'paddings': [0, 0, 0, 0] + 'paddings': [0, 0, 0, 0], + } + + +class TestBlockExpandOpCase5(OpTest): + def config(self): + self.batch_size = 1 + self.img_channels = 3 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'kernels': [2, 1], + 'strides': [2, 1], + 'paddings': [2, 1, 2, 1], + 'out_stride': [2, 2], } + def setUp(self): + self.config() + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") + real_size = np.array([[8, 10], [5, 8]]).astype("float32") + out = np.array(Im2Sequence(x, real_size, self.attrs)) + self.inputs = {'X': x, 'Y': real_size} #l ?? + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +class TestBlockExpandOpCase6(OpTest): + def config(self): + self.batch_size = 3 + self.img_channels = 1 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'kernels': [2, 1], + 'strides': [1, 1], + 'paddings': [0, 0, 0, 0], + 'out_stride': [1, 1], + } + + def setUp(self): + self.config() + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") + real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32") + out = np.array(Im2Sequence(x, real_size, self.attrs)) + self.inputs = {'X': x, 'Y': real_size} #l ?? 
+ self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +class TestBlockExpandOpCase7(OpTest): + def config(self): + self.batch_size = 2 + self.img_channels = 2 + self.img_height = 3 + self.img_width = 3 + self.attrs = { + 'kernels': [2, 2], + 'strides': [1, 1], + 'paddings': [1, 0, 1, 0], + 'out_stride': [2, 2], + } + + def setUp(self): + self.config() + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") + real_size = np.array([[6, 6], [4, 4]]).astype("float32") + out = np.array(Im2Sequence(x, real_size, self.attrs)) + self.inputs = {'X': x, 'Y': real_size} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + if __name__ == '__main__': unittest.main() +#set shiftwidth=4 set expandtab set tabstop=4 diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py index 6ecfa9ea213fe0cf57e18fa83bbb85c223727d71..405637969af6fb515a24ecb077e470279c3ffc24 100644 --- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py +++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid @@ -43,7 +45,7 @@ class TestLayer(unittest.TestCase): hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu') fluid.layers.batch_norm(input=hidden2) - print str(main_program) + print(str(main_program)) def test_dropout_layer(self): main_program = Program() @@ -53,7 +55,7 @@ class TestLayer(unittest.TestCase): name='pixel', shape=[3, 48, 48], dtype='float32') fluid.layers.dropout(x=images, dropout_prob=0.5) - print str(main_program) + print(str(main_program)) def test_img_conv_group(self): main_program = Program() @@ -65,7 +67,7 @@ class TestLayer(unittest.TestCase): conv1 = conv_block(images, 64, 2, [0.3, 0]) conv_block(conv1, 256, 3, [0.4, 0.4, 0]) - print str(main_program) + print(str(main_program)) def test_elementwise_add_with_act(self): main_program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py index 699a2d42467b7ac0dcf1939bde744ad2fcb29c97..a3d700aad8236fea7bb0e6d043323ad3bd0851f2 100644 --- a/python/paddle/fluid/tests/unittests/test_infer_shape.py +++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest +import six import paddle.fluid.core as core @@ -27,14 +30,14 @@ class TestInferShape(unittest.TestCase): shape = [10, 20] # prepare input/output - x1 = block.var("x1") + x1 = block.var(six.b("x1")) x1.set_type(core.VarDesc.VarType.LOD_TENSOR) x1.set_shape(shape) - x2 = block.var("x2") + x2 = block.var(six.b("x2")) x2.set_type(core.VarDesc.VarType.LOD_TENSOR) x2.set_shape(shape) - out = block.var("out") + out = block.var(six.b("out")) out.set_type(core.VarDesc.VarType.LOD_TENSOR) # prepare the operator @@ -57,14 +60,14 @@ class TestInferShape(unittest.TestCase): y_shape = [20, 30] # prepare input/output - x1 = block.var("x") + x1 = block.var(six.b("x")) x1.set_type(core.VarDesc.VarType.LOD_TENSOR) x1.set_shape(x_shape) - x2 = block.var("y") + x2 = block.var(six.b("y")) x2.set_type(core.VarDesc.VarType.LOD_TENSOR) x2.set_shape(y_shape) - out = block.var("out") + out = block.var(six.b("out")) out.set_type(core.VarDesc.VarType.LOD_TENSOR) # prepare the operator diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 51460cbb1370f6794e13d18fe099865b4713691f..9962702f69644b7aef7d868f086abb390441f617 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest +import six import numpy as np import paddle.fluid.core as core @@ -48,7 +51,7 @@ class TestBook(unittest.TestCase): exe.run(init_program, feed={}, fetch_list=[]) - for i in xrange(100): + for i in six.moves.xrange(100): tensor_x = np.array( [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32") tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32") @@ -64,7 +67,7 @@ class TestBook(unittest.TestCase): 'y': tensor_y}, fetch_list=[avg_cost])[0] - reload(executor) # reload to build a new scope + six.moves.reload_module(executor) # reload to build a new scope exe = executor.Executor(place) [infer_prog, feed_var_names, fetch_vars] = load_inference_model( diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 15a72cb605911dfe957fb927763174521a30a085..ab7183f88df809e584ca50ba16221bfdfe1376a9 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import numpy as np import unittest @@ -27,12 +29,13 @@ class TestConstantInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.ConstantInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.ConstantInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -43,12 +46,13 @@ class TestConstantInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.ConstantInitializer(2.3)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.ConstantInitializer(2.3)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -61,12 +65,13 @@ class TestUniformInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -80,18 +85,19 @@ class TestUniformInitializer(unittest.TestCase): program = framework.Program() program.random_seed = 123 block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer()) - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer(seed=456)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param1", + initializer=initializer.UniformInitializer()) + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param2", + initializer=initializer.UniformInitializer(seed=456)) init_op = block.ops[1] self.assertEqual(init_op.attr("seed"), 123) init_op1 = block.ops[0] @@ -102,12 +108,13 @@ class TestUniformInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -115,6 +122,25 @@ class TestUniformInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA) self.assertEqual(init_op.attr('seed'), 123) + def test_uniform_initializer_two_op(self): + """Test uniform initializer with supplied attributes + """ + program = framework.Program() + block = program.global_block() + for i in range(2): + block.create_parameter( + 
dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(-4.2, float(i), 123)) + self.assertEqual(len(block.ops), 1) + init_op0 = block.ops[0] + self.assertEqual(init_op0.type, 'uniform_random') + self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA) + self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA) + self.assertEqual(init_op0.attr('seed'), 123) + class TestNormalInitializer(unittest.TestCase): def test_normal_initializer_default_value(self): @@ -122,12 +148,13 @@ class TestNormalInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.NormalInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.NormalInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -140,12 +167,13 @@ class TestNormalInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.NormalInitializer(2.3, 1.9, 123)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.NormalInitializer(2.3, 1.9, 123)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -161,12 +189,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -181,12 +210,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -203,12 +233,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -223,12 +254,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - 
shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -244,13 +276,14 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer( - fan_in=12, fan_out=23, seed=134)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer( + fan_in=12, fan_out=23, seed=134)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -267,12 +300,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -287,12 +321,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -308,12 +343,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -328,12 +364,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -348,13 +385,14 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - 
initializer=initializer.MSRAInitializer( - fan_in=12, seed=134)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer( + fan_in=12, seed=134)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -370,12 +408,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[8, 1, 3, 3], - lod_level=0, - name="param", - initializer=initializer.BilinearInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[8, 1, 3, 3], + lod_level=0, + name="param", + initializer=initializer.BilinearInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py index eff4212d91e609a7ef531280bbd3cf3671a59830..7c1808cf998e84c22c46df68ef07259c1a021c19 100644 --- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py +++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import numpy.random as random diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py index 11121d9b65351eab639b7618fac0e54714cf4680..26d607718aec0bdffa00b9b4bca06ec6c0196217 100644 --- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py index fa5b18a16f7a3e734ff8bb4f53240e8a9ce8fd8f..4e24a78ee54dfb1fb0e4f97317642cfaffe9436e 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import unittest from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py index ca21289a0d48a123aed90bc557ccc732702b47f1..62d385bc52cfb3a9fe15a82096ff33abc1bcc552 100644 --- a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py +++ b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 69365db4d104a1b69916a605534eff83e242289f..fb6c43136ff82af55d1fcc2969cf4a07ae081204 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -11,12 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import numpy as np from operator import mul import paddle.fluid.core as core import paddle.fluid as fluid +from functools import reduce np.random.random(123) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 842d34c07e94a79e3351347e2528ecc478cc56dc..b04346b052903959f44aa96f6fccb7d20652e854 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -16,10 +16,12 @@ from __future__ import print_function import unittest import paddle.fluid.layers as layers +from paddle.fluid.layers.device import get_places import paddle.fluid.nets as nets from paddle.fluid.framework import Program, program_guard, default_main_program from paddle.fluid.param_attr import ParamAttr import decorators +from paddle.fluid.initializer import Constant class TestBook(unittest.TestCase): @@ -157,7 +159,7 @@ class TestBook(unittest.TestCase): input=crf_decode, label=label, chunk_scheme="IOB", - num_chunk_types=(label_dict_len - 1) / 2) + num_chunk_types=(label_dict_len - 1) // 2) self.assertFalse(crf is None) self.assertFalse(crf_decode is None) @@ -173,6 +175,16 @@ class TestBook(unittest.TestCase): x=dat, label=lbl)) print(str(program)) + def test_hsigmoid(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[2], dtype='int64') + self.assertIsNotNone( + layers.hsigmoid( + input=x, label=y, num_classes=2)) + print(str(program)) + def test_sequence_expand(self): program = Program() with program_guard(program): @@ -228,6 +240,22 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.softmax(hid)) print(str(program)) + def test_sequence_unsqueeze(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[8, 2], dtype='float32') + out = layers.unsqueeze(input=x, axes=[1]) + self.assertIsNotNone(out) + print(str(program)) + + def test_squeeze(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[1, 1, 4], dtype='float32') + out = layers.squeeze(input=x, axes=[2]) + self.assertIsNotNone(out) + print(str(program)) + def test_lrn(self): program = Program() with program_guard(program): @@ -238,7 +266,7 @@ class TestBook(unittest.TestCase): def test_get_places(self): program = Program() with program_guard(program): - x = layers.get_places(device_count=4) + x = get_places(device_count=4) self.assertIsNotNone(x) print(str(program)) @@ -251,12 +279,16 @@ class TestBook(unittest.TestCase): print(str(program)) def test_im2sequence(self): - print("test_im2sequence") program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 128, 128], dtype='float32') + y = layers.data(name='y', shape=[], 
dtype='float32') output = layers.im2sequence( - input=x, stride=[1, 1], filter_size=[2, 2]) + input=x, + input_image_size=y, + stride=[1, 1], + filter_size=[2, 2], + out_stride=[1, 1]) self.assertIsNotNone(output) print(str(program)) @@ -264,16 +296,16 @@ class TestBook(unittest.TestCase): def test_nce(self): window_size = 5 words = [] - for i in xrange(window_size): + for i in range(window_size): words.append( layers.data( name='word_{0}'.format(i), shape=[1], dtype='int64')) dict_size = 10000 - label_word = int(window_size / 2) + 1 + label_word = int(window_size // 2) + 1 embs = [] - for i in xrange(window_size): + for i in range(window_size): if i == label_word: continue @@ -331,6 +363,25 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(loss) print(str(program)) + def test_scatter(self): + program = Program() + with program_guard(program): + x = layers.data( + name='x', + shape=[3, 3], + append_batch_size=False, + dtype='float32') + idx = layers.data( + name='idx', shape=[2], append_batch_size=False, dtype='int32') + updates = layers.data( + name='updates', + shape=[2, 3], + append_batch_size=False, + dtype='float32') + out = layers.scatter(input=x, index=idx, updates=updates) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): @@ -428,6 +479,92 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(ids) print(str(program)) + def test_rank_loss(self): + program = Program() + with program_guard(program): + label = layers.data( + name='label', + append_batch_size=False, + shape=[16, 1], + dtype="float32") + left = layers.data( + name='left', + append_batch_size=False, + shape=[16, 1], + dtype="float32") + right = layers.data( + name='right', + append_batch_size=False, + shape=[16, 1], + dtype="float32") + out = layers.rank_loss(label, left, right, name="rank_loss") + self.assertIsNotNone(out) + print(str(program)) + + def test_flatten(self): + program = Program() + with program_guard(program): + x = layers.data( + name='x', + append_batch_size=False, + shape=[4, 4, 3], + dtype="float32") + out = layers.flatten(x, axis=1, name="flatten") + self.assertIsNotNone(out) + + def test_shape(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.shape(input, name="shape") + self.assertIsNotNone(out) + print(str(program)) + + def test_pad2d(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.pad2d( + input, + paddings=[1, 2, 3, 4], + mode='reflect', + data_format='NCHW', + name="shape") + self.assertIsNotNone(out) + print(str(program)) + + def test_prelu(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[5, 200, 100, 100], dtype="float32") + mode = 'channel' + out = layers.prelu( + input, + mode, + param_attr=ParamAttr(initializer=Constant(1.0)), + name='prelu') + self.assertIsNotNone(out) + print(str(program)) + + def test_sequence_enumerate(self): + program = Program() + with program_guard(program): + x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1) + out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) + print(str(program)) + + def test_cross_entropy(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[30, 10], dtype="float32") + label = layers.data(name="label", shape=[30, 1], 
dtype="int32") + mode = 'channel' + out = layers.cross_entropy(x, label, False, 4) + self.assertIsNotNone(out) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 6382e290eb30c621da64d5c600be6d8a7c6254f1..0d3e6d73e0149fe633b8f1de9041068c2e3bb293 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import copy import math import unittest @@ -91,20 +93,21 @@ class TestLearningRateDecay(unittest.TestCase): def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn, kwargs): + main_prog = fluid.Program() + startup_prog = fluid.Program() - decayed_lr = fluid_decay_fn(**kwargs) + with fluid.program_guard(main_prog, startup_prog): + decayed_lr = fluid_decay_fn(**kwargs) place = fluid.CPUPlace() exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + exe.run(startup_prog) - fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(main_prog) for step in range(10): - lr_val, = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[decayed_lr]) + lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr]) python_decayed_lr = python_decay_fn( global_step=float(step), **kwargs) self.assertAlmostEqual( diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py index 696d0ab4fa81a409a2bf0d6f6f23779ec26eb6d2..6e31e9204e95d98fcf69ed84a46d6cf3d94c808a 100644 --- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import random import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1cdc69501043d120b9e3cc8ccda3a1212d205886..48b52a5412eb99fbc7a5c8534a766ede4954e849 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle import paddle.fluid as fluid import os diff --git a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py index d8b4e40662568f580ccff0257512cb8809488f17..15485df5ac440f2ff666ca27ef8e8bcc5df866c0 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py +++ b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import paddle.fluid.layers as layers from paddle.fluid.executor import Executor diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py index bac5e502318397b43e9867d5fc9e4e8cd33394b8..865ca118d55f82c66d44f4e3d553baafa0c14c3a 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py +++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.layers import lod_rank_table, data +from __future__ import print_function + +from paddle.fluid.layers import data +from paddle.fluid.layers.control_flow import lod_rank_table from paddle.fluid.executor import Executor import paddle.fluid.core as core import numpy @@ -35,7 +38,7 @@ class TestLoDRankTable(unittest.TestCase): exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() - self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items()) + self.assertEqual([(0, 5), (1, 1), (2, 1)], list(table.items())) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py index 77905c4b96499c855fd5c5e704b8051ccdb7a323..31f364a42f624c8662a5ae087b003ca0304ae419 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py +++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py index 118c22fbb1ff6be5859ae9e4aed6218b0c77deec..6ad27de9a0e42d1a15ec4a17804c7c0f7ebf5d94 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core import numpy @@ -24,7 +26,7 @@ class TestLoDTensorArray(unittest.TestCase): tensor_array = arr.get_lod_tensor_array() self.assertEqual(0, len(tensor_array)) cpu = core.CPUPlace() - for i in xrange(10): + for i in range(10): t = core.LoDTensor() t.set(numpy.array([i], dtype='float32'), cpu) t.set_recursive_sequence_lengths([[1]]) @@ -32,7 +34,7 @@ class TestLoDTensorArray(unittest.TestCase): self.assertEqual(10, len(tensor_array)) - for i in xrange(10): + for i in range(10): t = tensor_array[i] self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32')) self.assertEqual([[1]], t.recursive_sequence_lengths()) diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index cebe6997bb4152519dabbabfc0404d6036bc4e65..6a78ef5078a738efa2ae39ea23645fedaecce63b 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import paddle.fluid.core as core import numpy @@ -20,6 +22,11 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward +from paddle.fluid.layers.control_flow import lod_rank_table +from paddle.fluid.layers.control_flow import max_sequence_len +from paddle.fluid.layers.control_flow import lod_tensor_to_array +from paddle.fluid.layers.control_flow import array_to_lod_tensor + class TestCPULoDTensorArrayOps(unittest.TestCase): def place(self): @@ -30,8 +37,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) tensor.set_recursive_sequence_lengths([[3, 6, 1]]) - expect = map(lambda x: numpy.array(x).astype('int32'), - [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) + expect = [ + numpy.array(x).astype('int32') + for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]] + ] self.main( tensor=tensor, expect_array=expect, @@ -43,8 +52,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]]) - expect = map(lambda x: numpy.array(x).astype('int32'), - [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) + expect = [ + numpy.array(x).astype('int32') + for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]] + ] self.main( tensor=tensor, expect_array=expect, @@ -106,8 +117,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): expect = [ numpy.array( item, dtype='int32') - for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range( - 22, 39) + range(7, 21), range(39, 46)] + for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], list( + range(22, 39)) + list(range(7, 21)), list(range(39, 46))] ] lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]], [[2], [6, 1]]] @@ -137,13 +148,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): with program_guard(program): x = layers.data(name='x', shape=[10]) x.persistable = True - table = layers.lod_rank_table(x, level=level) - max_len = layers.max_sequence_len(table) + table = lod_rank_table(x, level=level) + max_len = max_sequence_len(table) max_len.persistable = True - array = layers.lod_tensor_to_array(x, table) + array = lod_tensor_to_array(x, table) array.persistable = True - result = layers.array_to_lod_tensor(array, table) + result = array_to_lod_tensor(array, table) result.persistable = True exe = Executor(place) scope = core.Scope() @@ -181,9 +192,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): with program_guard(program): x = layers.data( name='x', shape=[1], dtype='float32', stop_gradient=False) - table = layers.lod_rank_table(x, level=0) - array = layers.lod_tensor_to_array(x, table) - result = layers.array_to_lod_tensor(array, table) + table = lod_rank_table(x, level=0) + array = lod_tensor_to_array(x, table) + result = array_to_lod_tensor(array, table) mean = layers.mean(result) diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py index d3980b8db93ca517d16b6f782ba800ce839c3f45..784f4f648d52bdf4f2357f4454d790a8d53288f3 100644 --- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py index 1d7dfe60f200459705a48664c1a5b22d2a5888d2..521851a3d57a4a3e8b2c8e1639325cc6c88fdd84 100644 --- a/python/paddle/fluid/tests/unittests/test_logical_op.py +++ b/python/paddle/fluid/tests/unittests/test_logical_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import op_test import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py index aa9eae1e882f55ef51f38e158317a1a9aeed641c..11e5d8b536fb65b66c954991bf815241774702ec 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -19,36 +21,27 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def output_hist(out): - hist, _ = np.histogram(out, range=(-5, 10)) - hist = hist.astype("float32") - hist /= float(out.size) - prob = 0.1 * np.ones((10)) - return hist, prob - - class TestLookupSpraseTable(OpTest): def check_with_place(self, place): scope = core.Scope() - # create and initialize Id Variable - ids = scope.var("Ids").get_tensor() - ids_array = np.array([0, 2, 3, 5, 100]).astype("int64") - ids.set(ids_array, place) - # create and initialize W Variable - rows = [0, 1, 2, 3, 4, 5, 6] - row_numel = 10000 + table_size = 10000 + row_numel = 8 w_selected_rows = scope.var('W').get_selected_rows() - w_selected_rows.set_height(len(rows)) - w_selected_rows.set_rows(rows) - w_array = np.ones((len(rows), row_numel)).astype("float32") - for i in range(len(rows)): + w_selected_rows.set_height(table_size) + w_array = np.ones((table_size, row_numel)).astype("float32") + for i in range(table_size): w_array[i] *= i w_tensor = w_selected_rows.get_tensor() w_tensor.set(w_array, place) + # create and initialize Id Variable + ids = scope.var("Ids").get_tensor() + ids_array1 = np.array([0, 2, 3, 2, 5, 0, 100]).astype("int64") + ids.set(ids_array1, place) + # create Out Variable out_tensor = scope.var('Out').get_tensor() @@ -64,16 +57,28 @@ class TestLookupSpraseTable(OpTest): lookup_table.run(scope, place) # get result from Out - result_array = np.array(out_tensor) + result_array1 = np.array(out_tensor) # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array[:-2]): - assert (row == result_array[idx]).all() + assert (result_array1[0] == w_array[0]).all() + assert (result_array1[1] == w_array[1]).all() + assert (result_array1[2] == w_array[2]).all() + assert (result_array1[3] == w_array[1]).all() + assert (result_array1[4] == w_array[3]).all() + assert (result_array1[5] == w_array[0]).all() + assert (result_array1[6] == w_array[4]).all() + + # create and initialize Id Variable + ids = scope.var("Ids").get_tensor() + ids_array2 = np.array([4, 2, 3, 7, 100000]).astype("int64") + ids.set(ids_array2, place) + lookup_table.run(scope, place) - # check the random value - hist, prob = 
output_hist(result_array[-1]) - self.assertTrue( - np.allclose( - hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + result_array2 = np.array(out_tensor) + assert (result_array2[0] == w_array[5]).all() + assert (result_array2[1] == w_array[1]).all() + assert (result_array2[2] == w_array[2]).all() + assert (result_array2[3] == w_array[6]).all() + assert (result_array2[4] == w_array[7]).all() def test_w_is_selected_rows(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index f8d5785fbfe64843f4aa3b96b24809df60980c74..4990ee898d81089735f6db4ee4ad6758944e311a 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest import paddle.fluid.core as core from paddle.fluid.op import Operator +import paddle.compat as cpt class TestLookupTableOp(OpTest): @@ -35,77 +38,59 @@ class TestLookupTableOp(OpTest): self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) +class TestLookupTableOpWithTensorIds(OpTest): + def setUp(self): + self.op_type = "lookup_table" + table = np.random.random((17, 31)).astype("float32") + ids = np.random.randint( + low=0, high=17, size=(2, 4, 5, 1)).astype("int64") + self.inputs = {'W': table, 'Ids': ids} + self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + + class TestLookupTableOpWithPadding(TestLookupTableOp): def test_check_output(self): ids = np.squeeze(self.inputs['Ids']) padding_idx = np.random.choice(ids, 1)[0] self.outputs['Out'][ids == padding_idx] = np.zeros(31) - self.attrs = {'padding_idx': long(padding_idx)} + self.attrs = {'padding_idx': int(padding_idx)} self.check_output() def test_check_grad(self): - # Since paddings are not trainable and fixed in forward, the gradient of + # Since paddings are not trainable and fixed in forward, the gradient of # paddings makes no sense and we don't test the gradient here. 
pass -class TestLookupTableIdsIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() - - # create and initialize Variable - height = 10 - rows = [0, 4, 4, 7] - row_numel = 12 - - # create and initialize W Variable - W = scope.var('W').get_tensor() - W_array = np.full((height, row_numel), 1.0).astype("float32") - for i in range(height): - W_array[i] *= i - W.set(W_array, place) - - # create and initialize Ids Variable - ids_selected_rows = scope.var('Ids').get_selected_rows() - ids_selected_rows.set_height(len(rows)) - ids_selected_rows.set_rows(rows) - np_array = np.ones((len(rows), row_numel)).astype("float32") - ids_tensor = ids_selected_rows.get_tensor() - ids_tensor.set(np_array, place) - - # create Out Variable - Out = scope.var('Out').get_selected_rows() - - # create and run lookup_table operator - concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - concat_rows_op.run(scope, place) - - # get result from Out - Out_tensor = Out.get_tensor() - result_array = np.array(Out_tensor) - - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(rows): - assert (row == result_array[idx]).all() +class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): + def test_check_output(self): + ids = self.inputs['Ids'] + flatten_idx = ids.flatten() + padding_idx = np.random.choice(flatten_idx, 1)[0] + self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) + self.attrs = {'padding_idx': cpt.long_type(padding_idx)} + self.check_output() - def test_concat_rows(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - for place in places: - self.check_with_place(place) + def test_check_grad(self): + # Since paddings are not trainable and fixed in forward, the gradient of + # paddings makes no sense and we don't test the gradient here. 
+ pass class TestLookupTableWIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() - - # create and initialize Id Variable + def prepare_ids(self, scope, place): ids_tensor = scope.var('Ids').get_tensor() ids_array = np.array([[0], [4], [3], [5]]).astype("int64") ids_tensor.set(ids_array, place) + return ids_array - # create and initialize W Variable + def prepare_w(self, scope, place): rows = [0, 1, 2, 3, 4, 5, 6] row_numel = 12 @@ -118,8 +103,22 @@ class TestLookupTableWIsSelectedRows(OpTest): w_tensor = w_selected_rows.get_tensor() w_tensor.set(w_array, place) - # create Out Variable - out_tensor = scope.var('Out').get_tensor() + def create_out_tensor(self, scope, place): + return scope.var('Out').get_tensor() + + def check_result(self, ids_array, result_array): + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(ids_array): + assert (row[0] == result_array[idx]).all() + + def check_with_place(self, place): + scope = core.Scope() + + ids_array = self.prepare_ids(scope, place) + + self.prepare_w(scope, place) + + out_tensor = self.create_out_tensor(scope, place) # create and run lookup_table operator lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') @@ -127,9 +126,8 @@ class TestLookupTableWIsSelectedRows(OpTest): # get result from Out result_array = np.array(out_tensor) - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array): - assert (row[0] == result_array[idx]).all() + + self.check_result(ids_array, result_array) def test_w_is_selected_rows(self): places = [core.CPUPlace()] @@ -138,5 +136,19 @@ class TestLookupTableWIsSelectedRows(OpTest): self.check_with_place(place) +class TestLookupTableWithTensorIdsWIsSelectedRows( + TestLookupTableWIsSelectedRows): + def prepare_ids(self, scope, place): + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.random.randint( + low=0, high=6, size=(2, 4, 3, 1)).astype("int64") + ids_tensor.set(ids_array, place) + return ids_array + + def check_result(self, ids_array, result_array): + for idx, row in np.ndenumerate(ids_array): + assert (row == result_array[idx]).all() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py index 966a16dc870c041b9deb140bed57d907cf305fd8..f6bb2ab7a696c40cb61dd5b38ca702b577fe7ea2 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from test_lrn_op import TestLRNOp diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index eaff45cbb2a58798e9d55149510bec72eea370cd..bb91f26bbb53de454a6d037af4c9d96262866ce3 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -34,7 +36,7 @@ class TestLRNOp(OpTest): return x + 1 def get_out(self): - start = -(self.n - 1) / 2 + start = -(self.n - 1) // 2 end = start + self.n mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index 705a24bd8f39a55e0a352944d961f8d33aaf96ff..76a24123fc7d51231bf24a3d1a6930186c94a5db 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py index e343265874f99afcd8201fa997932e2613fffc4c..eaa6b774c4d3e7add555c34f887e86dc847583b2 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py index ed2262da4bc727657c2e65d69cb1922891e17b09..9c3ec45515ffe0a07541fd9cfb7e92b079264071 100644 --- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py @@ -11,6 +11,8 @@ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #See the License for the specific language governing permissions and #limitations under the License. + +from __future__ import print_function import unittest import numpy as np import test_lstm_op as LstmTest diff --git a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py index 97c112487fd9193ab77d18945585a554e5fbcdf8..4a7e952436bd46c92c6256b4ec2d0652cfa38959 100644 --- a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 852a80261e02f5ed19e7fbe608d490be1f7798a9..b25d40a3a15e259878222ee5482cd842543b63d6 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import decorators import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index cae2c8fa87d9857de8f26cf4962d9370eca66243..abf10437d83268a6a84a1c62399eb02cd3b1d663 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index f5ddf72516bf8adb41698d9b2e22c7de74a3fad9..d588b22fe2607a6041359d420ebba757d8a632d6 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -19,7 +21,7 @@ from op_test import OpTest def maxout_forward_naive(input, groups): s0, s1, s2, s3 = input.shape - return np.ndarray([s0, s1 / groups, groups, s2, s3], \ + return np.ndarray([s0, s1 // groups, groups, s2, s3], \ buffer = input, dtype=input.dtype).max(axis=(2)) diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py index 64d42b693bf11f3cb0153243909db4c0612bf4e7..03e94483178e83adad9886cd7df2107581360dd1 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_iou.py +++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from __future__ import division import unittest import numpy as np @@ -80,7 +82,7 @@ class TestMeanIOUOp(OpTest): 'InCorrects': in_corrects, 'InMeanIou': in_mean_ious } - self.attrs = {'num_classes': long(self.num_classes)} + self.attrs = {'num_classes': int(self.num_classes)} mean_iou, out_wrong, out_correct = compute_mean_iou( predictions, labels, self.num_classes, in_wrongs, in_corrects, in_mean_ious) diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index 15472a8fc4716f218b0eddf17589634c565130b1..ff338f0e0037307e81a92eed804096c9a2a87361 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index cfd6e63e12258a92447e68b4afbc7ead91b68cc1..67733807f8f8582f68dcfa3f361e13a631a29597 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase): print(str(result_program)) +class TestMemoryTranspiler2(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + fc = layers.fc(input=x, size=10, act=None) + reshape = layers.reshape(x=fc, shape=[-1, 2, 5]) + fc = layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = layers.fc(input=fc, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + opt = optimizer.SGD(learning_rate=0.001) + opt.minimize(avg_cost) + self.program = program + + def test_inplace_ops(self): + print("before optimization") + print(str(self.program)) + result_program = memory_optimize(self.program) + print("after optimization") + print(str(result_program)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py new file mode 100644 index 0000000000000000000000000000000000000000..4cdb5b5d9f7f020c4eb9a3b3a804c074d7ddbb35 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import paddle +import paddle.fluid as fluid +import contextlib +import unittest + + +def train_simulator(test_batch_size=10): + if test_batch_size <= 0: + raise ValueError("batch_size should be a positive integeral value, " + "but got batch_size={}".format(test_batch_size)) + + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + # Calculate memory usage in current network config + lower_usage, upper_usage, unit = fluid.contrib.memory_usage( + fluid.default_main_program(), batch_size=test_batch_size) + + print("memory usage is about %.3f - %.3f %s" % + (lower_usage, upper_usage, unit)) + + +class TestMemoryUsage(unittest.TestCase): + def test_with_unit_B(self): + with self.program_scope_guard(): + train_simulator() + + def test_with_unit_KB(self): + with self.program_scope_guard(): + train_simulator(test_batch_size=1000) + + def test_with_unit_MB(self): + with self.program_scope_guard(): + train_simulator(test_batch_size=100000) + + @contextlib.contextmanager + def program_scope_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py index f209bdf30faffc0b2c7932b7b10f384d6d61a831..26ce7024117162e8bad403a9d8b8518c27578c83 100644 --- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py index 54ee85c1a7a539fe9517f32adb35ab99b5ae2a07..4e5cc91268c5df4be3de3c04a82ef65b33cf4d20 100644 --- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py +++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py index ee32bd49925e266cfd5beb51496355e111a3d0d2..54253b17b967871b03628023c5a9fdb339af1828 100644 --- a/python/paddle/fluid/tests/unittests/test_minus_op.py +++ b/python/paddle/fluid/tests/unittests/test_minus_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py index 62035efe8ec3809a7672e58697de4304426338d7..02fecfe47ec3fbff085b0a7f24316e5d0f6cd814 100644 --- a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index aaea9c1809213c5707e8540eebbdd6f269836fdc..7137fd0fdb7c503492107da684b95989037eb872 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -39,7 +41,7 @@ class TestMomentumOp1(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out @@ -75,7 +77,7 @@ class TestMomentumOp2(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index bbc782c1bce302df68ab30013f3a7667e51ed479..fca4ffa88b79ebfad009b436d440e86ddceaaed7 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py index dbd510e64ffdd6f3b78b22bb0d37d9a7ba3fd9b5..09788868ccb926f56c2f622b5caf695670fd17f8 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import paddle.fluid as fluid @@ -39,17 +41,17 @@ class TestMultipleReader(unittest.TestCase): copyfile('./mnist_0.recordio', './mnist_1.recordio') copyfile('./mnist_0.recordio', './mnist_2.recordio') - def main(self, thread_num): + def main(self, is_test=False): file_list = [ './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio' ] with fluid.program_guard(fluid.Program(), fluid.Program()): data_files = fluid.layers.open_files( filenames=file_list, - thread_num=thread_num, shapes=[(-1, 784), (-1, 1)], lod_levels=[0, 0], - dtypes=['float32', 'int64']) + dtypes=['float32', 'int64'], + is_test=is_test) img, label = fluid.layers.read_file(data_files) if fluid.core.is_compiled_with_cuda(): @@ -71,6 +73,9 @@ class TestMultipleReader(unittest.TestCase): self.assertEqual(batch_count, self.num_batch * 3) def test_main(self): - self.main(thread_num=3) # thread number equals to file number - self.main(thread_num=10) # thread number is larger than file number - self.main(thread_num=2) # thread number is less than file number + self.main(is_test=False) + self.main(is_test=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py index 7fc9f550440d3d0e1a8182a69f5692b3df0aa258..4fae11e928dc7e066799a8936bada0e252afaa42 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index aacd8ae45af10a2b19d2903ab121e9bb4f9de7ff..df0562dcc79cbb960136c19d2b3f243cf2e09782 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -11,6 +11,8 @@ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #See the License for the specific language governing permissions and #limitations under the License. 
+ +from __future__ import print_function import unittest import numpy as np import copy @@ -112,7 +114,7 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, if keep_top_k > -1 and num_det > keep_top_k: score_index = [] - for c, indices in selected_indices.iteritems(): + for c, indices in selected_indices.items(): for idx in indices: score_index.append((scores[c][idx], c, idx)) @@ -143,7 +145,7 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, lod.append(nmsed_num) if nmsed_num == 0: continue - for c, indices in nmsed_outs.iteritems(): + for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) diff --git a/python/paddle/fluid/tests/unittests/test_multihead_attention.py b/python/paddle/fluid/tests/unittests/test_multihead_attention.py index 80c3c67967e970a7182c008b6cfd138aff044167..f60da862ac091ca1eefccfe2834201d1c79e2def 100644 --- a/python/paddle/fluid/tests/unittests/test_multihead_attention.py +++ b/python/paddle/fluid/tests/unittests/test_multihead_attention.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py index 03cad8b43bac053f246156b8c57df9fceab20dcd..1567a74808aa37e5e18bbe583cc1d8987b31cd58 100644 --- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_name_scope.py b/python/paddle/fluid/tests/unittests/test_name_scope.py new file mode 100644 index 0000000000000000000000000000000000000000..08c802e20d2bb364ef7f116ee0042a2ad21a9b2b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_name_scope.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + + +class TestNameScope(unittest.TestCase): + def test_name_scope(self): + with fluid.name_scope("s1"): + a = fluid.layers.data(name='data', shape=[1], dtype='int32') + b = a + 1 + with fluid.name_scope("s2"): + c = b * 1 + with fluid.name_scope("s3"): + d = c / 1 + with fluid.name_scope("s1"): + f = fluid.layers.pow(d, 2.0) + with fluid.name_scope("s4"): + g = f - 1 + + for op in fluid.default_main_program().block(0).ops: + if op.type == 'elementwise_add': + self.assertEqual(op.desc.attr("op_namescope"), '/s1/') + elif op.type == 'elementwise_mul': + self.assertEqual(op.desc.attr("op_namescope"), '/s1/s2/') + elif op.type == 'elementwise_div': + self.assertEqual(op.desc.attr("op_namescope"), '/s1/s3/') + elif op.type == 'elementwise_sub': + self.assertEqual(op.desc.attr("op_namescope"), '/s4/') + elif op.type == 'pow': + self.assertEqual(op.desc.attr("op_namescope"), '/s1_1/') diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index 76ecc8ba08ba31798040080a0ae99fe515c28cec..0745bd274f73715b6fdec236819b8d89827e1346 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -66,7 +68,7 @@ class TestNCE(OpTest): self.attrs = { 'num_total_classes': num_classes, 'num_neg_samples': num_neg_samples, - 'custom_neg_classes': range(num_neg_samples) + 'custom_neg_classes': list(range(num_neg_samples)) } self.inputs = { 'Input': input, diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py index d4835dd18405fc7a0d508a780a734922e0abd12c..60dcf195daf61d76a2e6d6f764fa216270804f55 100644 --- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py +++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 108a665f37f5cd652ec83f784a56ca52e6b49fe8..a424260312eab850e579b4365efd071de599bd4f 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -61,5 +63,27 @@ class TestNormOp3(TestNormOp): self.epsilon = 1e-8 +class TestNormOp4(TestNormOp): + def init_test_case(self): + self.shape = [128, 1024, 14, 14] + self.axis = 2 + self.epsilon = 1e-8 + + def test_check_grad(self): + # since the gradient check is very slow in large shape, so skip check_grad + pass + + +class TestNormOp5(TestNormOp): + def init_test_case(self): + self.shape = [2048, 2048] + self.axis = 1 + self.epsilon = 1e-8 + + def test_check_grad(self): + # since the gradient check is very slow in large shape, so skip check_grad + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py index 198c68866d399023c51c2a43b588aa8ec49c3c9a..24fdcf8c88417244e981194e63bd77a2fdbd179d 100644 --- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py +++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py index 226e5e5d1131b1f33cfbbfefec536e6974f85b36..da943d64da6cfc64d121b7373f7c067c1cff731c 100644 --- a/python/paddle/fluid/tests/unittests/test_nvprof.py +++ b/python/paddle/fluid/tests/unittests/test_nvprof.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import os import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py index d13f2b3afde10f9b4e632094fa216d8729069afa..7afdae804a65b9fb05a521a1b08ce0bfb21d721f 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import math @@ -28,13 +30,13 @@ class TestOneHotOp(OpTest): depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') - for i in xrange(np.product(x.shape)): + for i in range(np.product(x.shape)): out[i, x[i]] = 1.0 self.inputs = {'X': (x, x_lod)} @@ -51,13 +53,13 @@ class TestOneHotOp_default_dtype(OpTest): depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') - for i in xrange(np.product(x.shape)): + for i in range(np.product(x.shape)): out[i, x[i]] = 1.0 self.inputs = {'X': (x, x_lod)} @@ -76,7 +78,7 @@ class TestOneHotOp_exception(OpTest): self.dimension = 12 self.x = core.LoDTensor() x_lod = [[4, 1, 3, 3]] - data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))] + data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))] data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) self.x.set(data, self.place) self.x.set_recursive_sequence_lengths(x_lod) diff --git a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py index 5fafb8280e19cca46e5bf687494c07200ca53153..e203fccd03f86077c51e176456c1c313ac14a9ee 100644 --- a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_operator.py b/python/paddle/fluid/tests/unittests/test_operator.py index 5e418fe6ac2d62948762290a65686207d017275c..544fca8cecd0a2b94a5aec40b9442f86036fd4d2 100644 --- a/python/paddle/fluid/tests/unittests/test_operator.py +++ b/python/paddle/fluid/tests/unittests/test_operator.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.op as op diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index c098a5a0cb0364f9ec93c95c1ef50912e574b3d9..cac132e6e08a8a9ec595236b1a990c0900ea4f0f 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import paddle.fluid.core as core +import paddle.compat as cpt from paddle.fluid.framework import Program, default_startup_program @@ -29,14 +32,15 @@ class TestOperator(unittest.TestCase): self.assertFail() except ValueError as v_err: self.assertEqual( - v_err.message, + cpt.get_exception_message(v_err), "`type` to initilized an Operator can not be None.") try: block.append_op(type="no_such_op") self.assertFail() except ValueError as a_err: - self.assertEqual(a_err.message, - "Operator \"no_such_op\" has not been registered.") + self.assertEqual( + cpt.get_exception_message(a_err), + "Operator \"no_such_op\" has not been registered.") def test_op_desc_creation(self): program = Program() @@ -63,7 +67,10 @@ class TestOperator(unittest.TestCase): self.assertEqual(mul_op.output("Out"), ["mul.out"]) self.assertEqual( set(mul_op.attr_names), - set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"])) + set([ + "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", + "op_namescope" + ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) self.assertEqual(mul_op.attr("x_num_col_dims"), 1) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 7286c7c450108c4b5ad7136041bc4e989894a2ba..4374d198f2f869afab5fb76fdcb43e3c445f7689 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.framework as framework @@ -97,7 +99,7 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer.create_optimization_pass( + opts = momentum_optimizer._create_optimization_pass( params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) sgd_op = opts[-1] @@ -151,7 +153,7 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer.create_optimization_pass( + opts = momentum_optimizer._create_optimization_pass( params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) sgd_op = opts[-1] @@ -214,8 +216,8 @@ class TestAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) - opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = adagrad_optimizer._create_optimization_pass( + params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "adagrad"]) @@ -278,8 +280,8 @@ class TestAdamOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) - opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = adam_optimizer._create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 5) 
self.assertEqual( [op.type for op in opts], @@ -287,7 +289,7 @@ class TestAdamOptimizer(unittest.TestCase): # Check accumulators accumulators = adam_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 2) + self.assertEqual(len(accumulators), 4) self.assertTrue(adam_optimizer.get_moment1_str() in accumulators) self.assertTrue(adam_optimizer.get_moment2_str() in accumulators) moment1_acc = accumulators[adam_optimizer.get_moment1_str()] @@ -345,8 +347,8 @@ class TestAdamaxOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) - opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 4) self.assertEqual( [op.type for op in opts], @@ -354,7 +356,7 @@ class TestAdamaxOptimizer(unittest.TestCase): # Check accumulators accumulators = adamax_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 2) + self.assertEqual(len(accumulators), 3) self.assertTrue(adamax_optimizer.get_moment_str() in accumulators) self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators) moment_acc = accumulators[adamax_optimizer.get_moment_str()] @@ -409,7 +411,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) - opts = decayed_adagrad_optimizer.create_optimization_pass( + opts = decayed_adagrad_optimizer._create_optimization_pass( params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) self.assertEqual( @@ -475,8 +477,8 @@ class TestFtrlOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) - opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "ftrl"]) diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py new file mode 100644 index 0000000000000000000000000000000000000000..728b8c181a4410d7df7f304bcc8d2816e91ea6d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestPad2dOp(OpTest): + def setUp(self): + self.pad_value = 0.0 + self.initTestCase() + self.op_type = "pad2d" + self.inputs = {'X': np.random.random(self.shape).astype("float32"), } + self.attrs = {} + self.attrs['paddings'] = np.array(self.paddings).flatten() + self.attrs['pad_value'] = self.pad_value + self.attrs['mode'] = self.mode + self.attrs['data_format'] = self.data_format + if self.data_format == "NCHW": + paddings = [(0, 0), (0, 0), (self.paddings[0], self.paddings[1]), + (self.paddings[2], self.paddings[3])] + else: + paddings = [(0, 0), (self.paddings[0], self.paddings[1]), + (self.paddings[2], self.paddings[3]), (0, 0)] + if self.mode == "constant": + out = np.pad(self.inputs['X'], + paddings, + mode=self.mode, + constant_values=self.pad_value) + else: + out = np.pad(self.inputs['X'], paddings, mode=self.mode) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', max_relative_error=0.006) + + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "constant" + self.data_format = "NCHW" + self.pad_value = 0.0 + + +class TestCase1(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NCHW" + + +class TestCase2(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "edge" + self.data_format = "NCHW" + + +class TestCase3(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NHWC" + + +class TestCase4(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "edge" + self.data_format = "NHWC" + + +class TestCase5(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "constant" + self.pad_value = 1.2 + self.data_format = "NHWC" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py new file mode 100644 index 0000000000000000000000000000000000000000..6b733fd8fa023f07013909502dbbd5371297216e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestPadOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = "pad_constant_like" + self.inputs = { + 'X': np.random.random(self.x_shape).astype("float32"), + 'Y': np.random.random(self.y_shape).astype("float32") + } + self.attrs = {} + self.attrs['pad_value'] = self.pad_value + self.outputs = { + 'Out': np.pad(self.inputs['Y'], + self.paddings, + mode='constant', + constant_values=self.pad_value) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['Y'], 'Out', max_relative_error=0.006) + + def initTestCase(self): + self.x_shape = (16, 16) + self.y_shape = (3, 16) + self.pad_value = 0.1 + self.paddings = [(0, 13), (0, 0)] + + +class TestCase1(TestPadOp): + def initTestCase(self): + self.x_shape = (4, 3, 4, 4) + self.y_shape = (2, 3, 4, 4) + self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)] + self.pad_value = 0.5 + + +class TestCase2(TestPadOp): + def initTestCase(self): + self.x_shape = (4, 3, 4, 4) + self.y_shape = (2, 3, 2, 4) + self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)] + self.pad_value = 0.5 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py index 300f3ffcb8d1a6152b1e03f1356582c02bc4b2a3..58e56ca1a4dbdc48765a36e1a64b9a2ec8cf9025 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 63fb58c6927fa387b3b19147b9dc9d24bb8e5132..6d6917300cb66afcc8a0c509986a0f26be8b1f09 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.dataset.conll05 as conll05 import paddle.fluid as fluid import unittest @@ -167,10 +169,10 @@ class TestCRFModel(unittest.TestCase): place=fluid.CPUPlace()) data = train_data() - for i in xrange(10): + for i in range(10): cur_batch = next(data) - print pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name])[0] + print(pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])[0]) @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_all_reduce(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 1f5d2f16773efb7537de85abec88344f8e0daa9f..a49c5d9b43ae1bffa7cb57764db497f68030b151 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import paddle.dataset.flowers as flowers import math import paddle.fluid as fluid +import paddle.fluid.core as core import unittest import numpy as np import paddle @@ -70,7 +73,7 @@ class TestFetchOp(unittest.TestCase): fetch_list = [] all_vars = main.global_block().vars - for k, v in all_vars.iteritems(): + for k, v in all_vars.items(): if 'tmp' not in k and k[0] is not '_' or v.persistable: fetch_list.append(k) @@ -82,6 +85,7 @@ class TestFetchOp(unittest.TestCase): assert not math.isnan(np.sum(ret[i])) and \ not math.isinf(np.sum(ret[i])) + @unittest.skip(reason="CI timeout") def test_fetch_op(self): tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) tst_reader_iter = tst_reader() @@ -89,10 +93,11 @@ class TestFetchOp(unittest.TestCase): iters = 3 train_inputs = [] for i in range(iters): - train_inputs.append(tst_reader_iter.next()) + train_inputs.append(next(tst_reader_iter)) os.environ['CPU_NUM'] = str(4) - self.parallel_exe(train_inputs, seed=1, use_cuda=True) + if core.is_compiled_with_cuda(): + self.parallel_exe(train_inputs, seed=1, use_cuda=True) self.parallel_exe(train_inputs, seed=1, use_cuda=False) @@ -131,13 +136,15 @@ class TestFeedParallel(unittest.TestCase): for batch_id, data in enumerate(reader()): loss_np = pe.run(feed=data, fetch_list=[loss.name])[0] - print batch_id, loss_np + print(batch_id, loss_np) if batch_id == 2: break + @unittest.skip(reason="CI timeout") def test_feed_op(self): os.environ['CPU_NUM'] = str(4) - self.parallel_exe(use_cuda=True, seed=1) + if core.is_compiled_with_cuda(): + self.parallel_exe(use_cuda=True, seed=1) self.parallel_exe(use_cuda=False, seed=1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index a801d99aa1ced35eb7f081fde63ad541f0eb2589..af3745987aa3eae96968bdc6b5c9cd951e9ca6fa 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + from parallel_executor_test_base import TestParallelExecutorBase import paddle.fluid as fluid +import paddle.fluid.core as core import numpy as np import paddle import paddle.dataset.mnist as mnist @@ -32,13 +35,11 @@ def simple_fc_net(use_feed): filenames=[MNIST_RECORDIO_FILE], shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) + dtypes=['float32', 'int64']) reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(4): + for _ in range(4): hidden = fluid.layers.fc( hidden, size=200, @@ -60,26 +61,26 @@ def fc_with_batchnorm(use_feed): filenames=[MNIST_RECORDIO_FILE], shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) + dtypes=['float32', 'int64']) reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(1): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - - hidden = fluid.layers.batch_norm(input=hidden) - - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + for _ in range(1): + with fluid.name_scope("hidden"): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + with fluid.name_scope("fc_layer"): + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + with fluid.name_scope("loss"): + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) return loss @@ -101,86 +102,123 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def _init_data(self): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_reduce_and_allreduce(self, model, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + self.check_network_convergence( + model, use_cuda=use_cuda, use_reduce=True) + self.check_network_convergence( + model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True) + + img, label = self._init_data() + + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=True) + + for loss in zip(all_reduce_first_loss, reduce_first_loss): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + for loss in zip(all_reduce_last_loss, reduce_last_loss): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) + + # simple_fc + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) - img = np.zeros(shape=[32, 784], 
dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') + img, label = self._init_data() + self.check_network_convergence( simple_fc_net, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_simple_fc(self): - self.check_simple_fc_convergence(False, use_cuda=True) - self.check_simple_fc_convergence(False, use_cuda=False) + # use_cuda + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True, use_cuda=True) - self.check_simple_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self._compare_reduce_and_allreduce(simple_fc_net, True) + self._compare_reduce_and_allreduce(simple_fc_net, False) + + def check_simple_fc_parallel_accuracy(self, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._init_data() - def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards, - use_cuda=True): - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( method=simple_fc_net, - seed=1000, + seed=1, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, - seed=1000, + seed=1, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=True, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_parallel_executor=True) - for p_f in parallel_first_loss: - self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) - for p_l in parallel_last_loss: - self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + self.assertAlmostEquals( + np.mean(parallel_first_loss), + single_first_loss, + delta=1e-6, ) + self.assertAlmostEquals( + np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False, use_cuda=True) - self.check_simple_fc_parallel_accuracy(False, use_cuda=False) + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(False) - def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True, use_cuda=True) - self.check_simple_fc_parallel_accuracy(True, use_cuda=False) + def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): + if use_cuda and not core.is_compiled_with_cuda(): + return - def check_batchnorm_fc_convergence( - self, balance_parameter_opt_between_cards, use_cuda): self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') + + img, label = self._init_data() + self.check_network_convergence( fc_with_batchnorm, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_fast_executor=use_fast_executor) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False, use_cuda=True) - self.check_batchnorm_fc_convergence(False, use_cuda=False) + for use_cuda in (False, True): + for use_fast_executor in (False, True): + self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) def 
test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True, use_cuda=True) - self.check_batchnorm_fc_convergence(True, use_cuda=False) + # FIXME(zcd): close this test temporally. + # self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 066299e6c6f7f6c159cb0886e86d3404b027b698..cc2d692e18430eb48e6e800106eab0c3739d3f53 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -12,10 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid +import paddle.fluid.layers.ops as ops +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import paddle.fluid.core as core from parallel_executor_test_base import TestParallelExecutorBase import unittest +import math import os +import numpy as np + +# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor +# and Executor is different. Because, for ParallelExecutor, the dropout_op of +# the neural net will be copied N copies(N is the number of device). This will +# lead to the random numbers generated by ParallelExecutor and Executor are different. +# So, if we compare the loss of ParallelExecutor and Executor, we should remove the +# dropout_op. +remove_dropout = False + +# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor +# and Executor is different. 
+remove_bn = False def squeeze_excitation(input, num_channels, reduction_ratio): @@ -28,7 +48,7 @@ def squeeze_excitation(input, num_channels, reduction_ratio): pool = fluid.layers.reduce_mean(input=reshape, dim=2) squeeze = fluid.layers.fc(input=pool, - size=num_channels / reduction_ratio, + size=num_channels // reduction_ratio, act='relu') excitation = fluid.layers.fc(input=squeeze, size=num_channels, @@ -44,11 +64,12 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, num_filters=num_filters, filter_size=filter_size, stride=stride, - padding=(filter_size - 1) / 2, + padding=(filter_size - 1) // 2, groups=groups, act=None, bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + return conv if remove_bn else fluid.layers.batch_norm( + input=conv, act=act, momentum=0.1) def shortcut(input, ch_out, stride): @@ -87,13 +108,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt50Small(batch_size=2, use_feed=False): - assert not use_feed, "SE_ResNeXt doesn't support feed yet" +batch_size = 12 +img_shape = [3, 224, 224] + - img = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) +def SE_ResNeXt50Small(use_feed): + + img = fluid.layers.data(name='image', shape=img_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') conv = conv_bn_layer( input=img, num_filters=16, filter_size=3, stride=2, act='relu') @@ -122,7 +144,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): reshape = fluid.layers.reshape( x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) pool = fluid.layers.reduce_mean(input=reshape, dim=2) - dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) + dropout = pool if remove_dropout else fluid.layers.dropout( + x=pool, dropout_prob=0.2, seed=1) # Classifier layer: prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) @@ -130,31 +153,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): return loss +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """ + Applies cosine decay to the learning rate. 
+ lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr + + +def optimizer(learning_rate=0.01): + optimizer = fluid.optimizer.Momentum( + learning_rate=cosine_decay( + learning_rate=learning_rate, step_each_epoch=2, epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + return optimizer + + class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True, - iter=20): + @classmethod + def setUpClass(cls): os.environ['CPU_NUM'] = str(4) + global remove_dropout + global remove_bn + remove_dropout = False + remove_bn = False + + def _init_data(self, batch_size=2, random=True): + np.random.seed(5) + if random: + img = np.random.random( + size=[batch_size] + img_shape).astype(np.float32) + else: + img = np.ones(shape=[batch_size] + img_shape, dtype='float32') + label = [np.random.randint(0, 999) for _ in range(batch_size)] + label = np.array(label).astype(np.int64).reshape(-1, 1) + return img, label + + def _compare_reduce_and_allreduce(self, + model, + use_cuda, + iter=20, + delta2=1e-6): + if use_cuda and not core.is_compiled_with_cuda(): + return + + global remove_bn + remove_bn = True - import functools - batch_size = 2 - self.check_network_convergence( - functools.partial( - SE_ResNeXt50Small, batch_size=batch_size), + img, label = self._init_data(batch_size=batch_size) + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, iter=iter, batch_size=batch_size, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=False, + optimizer=optimizer) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=True, + optimizer=optimizer) + + for loss in zip(all_reduce_first_loss, reduce_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(all_reduce_last_loss, reduce_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + + def _check_resnet_convergence(self, + model, + use_cuda=True, + use_reduce=False, + iter=20, + delta2=1e-6): + if use_cuda and not core.is_compiled_with_cuda(): + return + + global remove_dropout + global remove_bn + remove_dropout = True + remove_bn = True + + img, label = self._init_data(batch_size=batch_size) + single_first_loss, single_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=use_reduce, + optimizer=optimizer, + use_parallel_executor=False) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=use_reduce, + optimizer=optimizer) + + self.assertAlmostEquals( + np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) + self.assertAlmostEquals( + np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) - def test_resnet(self): - self.check_resnet_convergence(False, use_cuda=True) - self.check_resnet_convergence(False, use_cuda=False, iter=5) + def 
test_seresnext_with_learning_rate_decay(self): + self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) + self._check_resnet_convergence( + model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) - def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True, use_cuda=True) - self.check_resnet_convergence(True, use_cuda=False, iter=5) + def test_seresnext_with_new_strategy(self): + self._compare_reduce_and_allreduce( + model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) + self._compare_reduce_and_allreduce( + model=SE_ResNeXt50Small, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 9a2733927d38f1a2b1af92fcc12f036158b4d06f..f5a0ba624698b49e0d323e6f830be23a4148392b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid as fluid +import paddle.fluid.core as core import numpy as np import unittest import os @@ -24,7 +27,7 @@ def simple_fc_net(): img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') hidden = img - for _ in xrange(4): + for _ in range(4): hidden = fluid.layers.fc( hidden, size=200, @@ -70,7 +73,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): share_vars_from=train_exe, build_strategy=build_strategy) - for i in xrange(5): + for i in range(5): test_loss, = test_exe.run([loss.name], feed=feed_dict) train_loss, = train_exe.run([loss.name], feed=feed_dict) @@ -92,16 +95,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): def test_parallel_testing(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence( - use_cuda=True, build_strategy=build_strategy) + if core.is_compiled_with_cuda(): + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) self.check_network_convergence( use_cuda=False, build_strategy=build_strategy) def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence( - use_cuda=True, build_strategy=build_strategy) + if core.is_compiled_with_cuda(): + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) self.check_network_convergence( use_cuda=False, build_strategy=build_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index b6215fddb11bb6b3a76b5a6395e7254d21971c13..a55b2002ed989d4588716202a37aa6f4139825ea 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import paddle.fluid as fluid import transformer_model import numpy as np from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle +import paddle.fluid.core as core import paddle.dataset.wmt16 as wmt16 import os -WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" +WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" class ModelHyperParams(object): @@ -167,10 +170,10 @@ class TestTransformer(TestParallelExecutorBase): writer.append_tensor(t) writer.complete_append_tensor() - @unittest.skip("transformer is buggy in multi gpu") def test_main(self): - self.check_network_convergence(transformer, use_cuda=True) - self.check_network_convergence(transformer, use_cuda=False) + if core.is_compiled_with_cuda(): + self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence(transformer, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index 79bea148f9398152a02d70946cdc5fff1f47ba6b..d7b9af8bac67ef89cc1ae59ccf002c2c488f3436 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places import paddle.fluid.profiler as profiler import numpy +import six class BaseParallelForTest(unittest.TestCase): @@ -24,20 +28,20 @@ class BaseParallelForTest(unittest.TestCase): """ Run the unittest for parallel.for Args: - callback(callable): A callable function returns a generator. There - are two yields in the generator function. The first yield - returns the data layers, and the second yield returns the loss. - The modified data variables will be sent back during the first + callback(callable): A callable function returns a generator. There + are two yields in the generator function. The first yield + returns the data layers, and the second yield returns the loss. + The modified data variables will be sent back during the first yield. feed(dict): The executor feeding dictionary. - fetch(list|basestr): The fetch name lists. + fetch(list|basestr): The fetch name lists. Returns: None Raises: - AssertionError when the computation of cpu, parallel.for in cpu, + AssertionError when the computation of cpu, parallel.for in cpu, gpu, parallel.for in gpu are different. """ @@ -94,14 +98,14 @@ class BaseParallelForTest(unittest.TestCase): """ Run a single test, returns the fetch values Args: - place(Place): the computation place. - use_parallel(bool): Whether use parallel.for or not. + place(Place): the computation place. + use_parallel(bool): Whether use parallel.for or not. Returns: Fetched numpy arrays. 
""" - if isinstance(fetch, basestring): + if isinstance(fetch, six.string_types): fetch = [fetch] main = fluid.Program() startup = fluid.Program() @@ -113,15 +117,17 @@ class BaseParallelForTest(unittest.TestCase): generator = callback() # Automatically insert parallel do if use_parallel = True if use_parallel: - places = fluid.layers.get_places() + thread_num = fluid.core.get_cuda_device_count( + ) if use_gpu else 8 + places = get_places(thread_num) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) data = next(generator) - if isinstance(data, fluid.Variable): + if isinstance(data, fluid.framework.Variable): data = [data] with pd.do(): - ins = map(pd.read_input, data) + ins = list(map(pd.read_input, data)) if len(ins) == 1: ins = ins[0] loss = generator.send(ins) # patch input @@ -153,7 +159,7 @@ class BaseParallelForTest(unittest.TestCase): Returns: None - + Raises: AssertionError diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py index e09865074e8aa9345fd9cc84e1f19eaf0436142f..df42e6cb9a050b76099b4a53fdd08d2852284d1f 100644 --- a/python/paddle/fluid/tests/unittests/test_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_parameter.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from paddle.fluid.framework import default_main_program import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py index 2105d320665367e3ec1bfd7b3a353a144c91244f..dfedf8190f75ec26532f281338f076ca0c7d83af 100644 --- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py +++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -23,9 +25,9 @@ def PolygonBoxRestore(input): geo_channels = shape[1] h = shape[2] w = shape[3] - h_indexes = np.array(range(h) * w).reshape( + h_indexes = np.array(list(range(h)) * w).reshape( [w, h]).transpose()[np.newaxis, :] # [1, h, w] - w_indexes = np.array(range(w) * h).reshape( + w_indexes = np.array(list(range(w)) * h).reshape( [h, w])[np.newaxis, :] # [1, h, w] indexes = np.concatenate( (w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w] diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py index 003ebba18b26198427d9f313596ae85656ac24fa..14d7ed9057d622b136056e1b5bbbe57f9a04d5d7 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index f7e1e8573290766cde0c35816d687e7ba6fa4220..26969bd5230afdac83a943d2dc21094a0972d60a 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np @@ -29,14 +31,14 @@ def max_pool2D_forward_naive(x, if global_pool == 1: ksize = [H, W] H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 * - paddings[0]) / strides[0] + 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 * - paddings[1]) / strides[1] + 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + for i in range(H_out): + for j in range(W_out): r_start = np.max((i * strides[0] - paddings[0], 0)) r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) c_start = np.max((j * strides[1] - paddings[1], 0)) @@ -57,14 +59,14 @@ def avg_pool2D_forward_naive(x, if global_pool == 1: ksize = [H, W] H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 * - paddings[0]) / strides[0] + 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 * - paddings[1]) / strides[1] + 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + for i in range(H_out): + for j in range(W_out): r_start = np.max((i * strides[0] - paddings[0], 0)) r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) c_start = np.max((j * strides[1] - paddings[1], 0)) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index 142165f29beeaedfaa660f04424147e06710d192..77045c1307baead3711d58ed368dfa5f2acc3699 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np @@ -29,22 +31,22 @@ def max_pool3D_forward_naive(x, if global_pool == 1: ksize = [D, H, W] D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 * - paddings[0]) / strides[0] + 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 * - paddings[1]) / strides[1] + 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 * - paddings[2]) / strides[2] + 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) - for k in xrange(D_out): + for k in range(D_out): d_start = np.max((k * strides[0] - paddings[0], 0)) d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in xrange(H_out): + for i in range(H_out): h_start = np.max((i * strides[0] - paddings[0], 0)) h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - for j in xrange(W_out): + for j in range(W_out): w_start = np.max((j * strides[1] - paddings[1], 0)) w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] @@ -63,22 +65,22 @@ def avg_pool3D_forward_naive(x, if global_pool == 1: ksize = [D, H, W] D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 * - paddings[0]) / strides[0] + 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 * - paddings[1]) / strides[1] + 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 * - paddings[2]) / strides[2] + 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) - for k in xrange(D_out): + for k in range(D_out): d_start = np.max((k * strides[0] - paddings[0], 0)) d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in xrange(H_out): + for i in range(H_out): h_start = np.max((i * strides[0] - paddings[0], 0)) h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - for j in xrange(W_out): + for j in range(W_out): w_start = np.max((j * strides[1] - paddings[1], 0)) w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index cf9b7639224ef3804b946f729bb6a9cead4aae23..488ff431d4f2ef76ce0c9486d8c307b4e01b5544 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -24,26 +26,26 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): ksize = [D, H, W] paddings = [0, 0, 0] - D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1 + D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) mask = np.zeros((N, C, D_out, H_out, W_out)) - for k in xrange(D_out): + for k in range(D_out): d_start = np.max((k * strides[0] - paddings[0], 0)) d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in xrange(H_out): + for i in range(H_out): h_start = np.max((i * strides[0] - paddings[0], 0)) h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - for j in xrange(W_out): + for j in range(W_out): w_start = np.max((j * strides[1] - paddings[1], 0)) w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) - for n in xrange(N): - for c in xrange(C): + for n in range(N): + for c in range(C): arr = x_masked[n, c, :, :, :] index = np.where(arr == np.max(arr)) sub_deep = index[0][0] @@ -63,12 +65,12 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): ksize = [H, W] paddings = [0, 0] - H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) mask = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + for i in range(H_out): + for j in range(W_out): r_start = np.max((i * strides[0] - paddings[0], 0)) r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) c_start = np.max((j * strides[1] - paddings[1], 0)) @@ -77,8 +79,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) - for n in xrange(N): - for c in xrange(C): + for n in range(N): + for c in range(C): arr = x_masked[n, c, :, :] index = np.where(arr == np.max(arr)) sub_row = index[0][0] diff --git a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py index 091cfc9c72769fefc9c792bfeaa872cb357736b7..afe8d212d6ec218c3799780849c377e46a44bd6c 100644 --- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py +++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import itertools import numpy as np +import six from op_test import OpTest @@ -32,7 +35,7 @@ def py_pnpair_op(score, label, query, column=-1, weight=None): # accumulate statistics pos, neg, neu = 0, 0, 0 - for _, ranks in predictions.items(): + for _, ranks in six.iteritems(predictions): for e1, e2 in itertools.combinations(ranks, 2): s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2] w = (w1 + w2) * 0.5 diff --git a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py index 7830ba29583d369c4b9f6f3077dda1dda1fd1c46..645637625959f214db3875bc58e4c593c27ae8f6 100644 --- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py +++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -39,19 +41,19 @@ def get_states(idxs, labels, cls_num, weights=None): ins_num = idxs.shape[0] # TP FP TN FN states = np.zeros((cls_num, 4)).astype('float32') - for i in xrange(ins_num): + for i in range(ins_num): w = weights[i] if weights is not None else 1.0 idx = idxs[i][0] label = labels[i][0] if idx == label: states[idx][0] += w - for j in xrange(cls_num): + for j in range(cls_num): states[j][2] += w states[idx][2] -= w else: states[label][3] += w states[idx][1] += w - for j in xrange(cls_num): + for j in range(cls_num): states[j][2] += w states[label][2] -= w states[idx][2] -= w @@ -64,7 +66,7 @@ def compute_metrics(states, cls_num): total_fn_count = 0.0 macro_avg_precision = 0.0 macro_avg_recall = 0.0 - for i in xrange(cls_num): + for i in range(cls_num): total_tp_count += states[i][0] total_fp_count += states[i][1] total_fn_count += states[i][3] @@ -90,9 +92,9 @@ class TestPrecisionRecallOp_0(OpTest): ins_num = 64 cls_num = 10 max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = np.random.choice(xrange(cls_num), ins_num).reshape( + idxs = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') - labels = np.random.choice(xrange(cls_num), ins_num).reshape( + labels = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') states = get_states(idxs, labels, cls_num) metrics = compute_metrics(states, cls_num) @@ -117,10 +119,10 @@ class TestPrecisionRecallOp_1(OpTest): ins_num = 64 cls_num = 10 max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = np.random.choice(xrange(cls_num), ins_num).reshape( + idxs = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - labels = np.random.choice(xrange(cls_num), ins_num).reshape( + labels = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') states = get_states(idxs, labels, cls_num, weights) @@ -151,10 +153,10 @@ class TestPrecisionRecallOp_2(OpTest): ins_num = 64 cls_num = 10 max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = np.random.choice(xrange(cls_num), ins_num).reshape( + idxs = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - labels = np.random.choice(xrange(cls_num), ins_num).reshape( + labels = 
np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') states = np.random.randint(0, 30, (cls_num, 4)).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index ae19a553bb826002c562c15ee07759391d51b4d8..48a6b0577b6787d2e1231fdcbe6d2c1bb46414ed 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -12,38 +12,70 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np +import six from op_test import OpTest class PReluTest(OpTest): def setUp(self): self.op_type = "prelu" - x_np = np.random.normal(size=(10, 10)).astype("float32") - - for pos, val in np.ndenumerate(x_np): - # Since zero point in prelu is not differentiable, avoid randomize - # zero. - while abs(val) < 1e-3: - x_np[pos] = np.random.normal() - val = x_np[pos] - - x_np_sign = np.sign(x_np) - x_np = x_np_sign * np.maximum(x_np, .005) - alpha_np = np.array([.1], dtype="float32") - self.inputs = {'X': x_np, 'Alpha': alpha_np} + self.initTestCase() + x_np = np.random.normal(size=(3, 5, 5, 10)).astype("float32") + + # Since zero point in prelu is not differentiable, avoid randomize + # zero. + x_np[np.abs(x_np) < 0.005] = 0.02 + + if self.attrs == {'mode': "all"}: + alpha_np = np.random.rand(1).astype("float32") + self.inputs = {'X': x_np, 'Alpha': alpha_np} + elif self.attrs == {'mode': "channel"}: + alpha_np = np.random.rand(1, x_np.shape[1], 1, 1).astype("float32") + self.inputs = {'X': x_np, 'Alpha': alpha_np} + else: + alpha_np = np.random.rand(*x_np.shape).astype("float32") + self.inputs = {'X': x_np, 'Alpha': alpha_np} + out_np = np.maximum(self.inputs['X'], 0.) out_np = out_np + np.minimum(self.inputs['X'], 0.) * self.inputs['Alpha'] assert out_np is not self.inputs['X'] self.outputs = {'Out': out_np} + def initTestCase(self): + self.attrs = {'mode': "channel"} + def test_check_output(self): self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out') + def test_check_grad_1_ignore_x(self): + self.check_grad(['Alpha'], 'Out', no_grad_set=set('X')) + + def test_check_grad_2(self): + self.check_grad(['X', 'Alpha'], 'Out') + + def test_check_grad_3_ignore_alpha(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Alpha')) + + +# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues +if six.PY2: + + class TestCase1(PReluTest): + def initTestCase(self): + self.attrs = {'mode': "all"} + + class TestCase2(PReluTest): + def initTestCase(self): + self.attrs = {'mode': "channel"} + + class TestCase3(PReluTest): + def initTestCase(self): + self.attrs = {'mode': "element"} if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py index cbf1a7e0c50a87cd43507ffdb94109873cf4e5d9..98e609b76982650c9d18f87c3c0637056cc40899 100644 --- a/python/paddle/fluid/tests/unittests/test_preprocessor.py +++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np +import paddle import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle.dataset.mnist as mnist class TestPreprocessor(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py index e01af42a58b86042fd0282928d1a78d9c3239fe3..8097b5f734343ca97c131474338ed1cd60eefc85 100644 --- a/python/paddle/fluid/tests/unittests/test_print_op.py +++ b/python/paddle/fluid/tests/unittests/test_print_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core from paddle.fluid.executor import Executor @@ -33,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase): def build_network(self, only_forward, **kargs): x = layers.data('x', shape=[3], dtype='float32', lod_level=1) x.stop_gradient = False - printed = layers.Print(input=x, **kargs) - if only_forward: return printed - loss = layers.mean(printed) + layers.Print(input=x, **kargs) + loss = layers.mean(x) append_backward(loss=loss) return loss @@ -56,6 +57,8 @@ class TestPrintOpCPU(unittest.TestCase): return_numpy=False) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestPrintOpGPU(TestPrintOpCPU): def setUp(self): self.place = core.CUDAPlace(0) diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py index bcbc02a2baa46b9ab583ecf3006bd3262e6038fd..7381b74af71051f8b993ba6d116b5282dd9b84e1 100644 --- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import sys @@ -32,6 +34,7 @@ class TestPriorBoxOp(OpTest): 'variances': self.variances, 'flip': self.flip, 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, 'step_w': self.step_w, 'step_h': self.step_h, 'offset': self.offset @@ -52,6 +55,9 @@ class TestPriorBoxOp(OpTest): max_sizes = [5, 10] self.max_sizes = np.array(max_sizes).astype('float32').tolist() + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + def init_test_params(self): self.layer_w = 32 self.layer_h = 32 @@ -71,6 +77,7 @@ class TestPriorBoxOp(OpTest): self.set_max_sizes() self.aspect_ratios = [2.0, 3.0] self.flip = True + self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array( self.aspect_ratios, dtype=np.float).flatten() @@ -78,7 +85,6 @@ class TestPriorBoxOp(OpTest): self.variances = np.array(self.variances, dtype=np.float).flatten() self.clip = True - self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) if len(self.max_sizes) > 0: self.num_priors += len(self.max_sizes) @@ -106,26 +112,60 @@ class TestPriorBoxOp(OpTest): idx = 0 for s in range(len(self.min_sizes)): min_size = self.min_sizes[s] - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 - out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, - (c_y - c_h) / self.image_h, - (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h] - idx += 1 - - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + else: + c_w = c_h = min_size / 2. out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h] idx += 1 + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.) 
< 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 # clip the prior's coordidate such that it is within[0, 1] if self.clip: @@ -137,10 +177,15 @@ class TestPriorBoxOp(OpTest): self.out_var = out_var.astype('float32') -class TestPriorBoxOpWithMaxSize(TestPriorBoxOp): +class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): def set_max_sizes(self): self.max_sizes = [] +class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index cf6fe14a86aa1ab6ea3f60ad15f33d708e9b803a..7934164b84931f886967982ce0cb65c406bbf800 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import os import numpy as np @@ -23,9 +25,6 @@ import paddle.fluid.core as core class TestProfiler(unittest.TestCase): def net_profiler(self, state, profile_path='/tmp/profile'): - enable_if_gpu = state == 'GPU' or state == "All" - if enable_if_gpu and not core.is_compiled_with_cuda(): - return startup_program = fluid.Program() main_program = fluid.Program() @@ -82,12 +81,16 @@ class TestProfiler(unittest.TestCase): def test_cpu_profiler(self): self.net_profiler('CPU') + @unittest.skipIf(not core.is_compiled_with_cuda(), + "profiler is enabled only with GPU") def test_cuda_profiler(self): self.net_profiler('GPU') + @unittest.skipIf(not core.is_compiled_with_cuda(), + "profiler is enabled only with GPU") def test_all_profiler(self): self.net_profiler('All', '/tmp/profile_out') - with open('/tmp/profile_out', 'r') as f: + with open('/tmp/profile_out', 'rb') as f: self.assertGreater(len(f.read()), 0) diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index c51a48239330621d8e008415f81361616467cabf..0997afc97a97333c914a3027103ec48733b410dc 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -17,6 +17,7 @@ import unittest from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name import paddle.fluid.layers as layers +import paddle.fluid as fluid main_program = default_main_program() @@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase): new_program = main_program.clone() self.assertNotEqual(0, len(new_program.blocks[0].all_parameters())) + def test_program_inference_optimize(self): + def net(): + reader = fluid.layers.py_reader( + capacity=10, + shapes=[[-1, 10], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + use_double_buffer=True) + in_data, label = fluid.layers.read_file(reader) + predict_label = fluid.layers.fc(in_data, size=2, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict_label, label=label)) + + optimizer = fluid.optimizer.Adam() + optimizer.minimize(loss) + + startup_program = fluid.Program() + main_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + net() + 
no_read_program = main_program.inference_optimize() + keep_read_program = main_program.inference_optimize( + export_for_deployment=False) + no_read_ops = no_read_program.global_block().ops + keep_read_ops = keep_read_program.global_block().ops + self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2) + self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader') + self.assertEqual(keep_read_ops[1].type, 'read') + + for i in range(len(no_read_ops)): + self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_program_code.py b/python/paddle/fluid/tests/unittests/test_program_code.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c2b928617dce3904ca119896ca81454256e82e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_program_code.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import unittest +from multiprocessing import Process +import signal + +import numpy + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.layers.io import ListenAndServ +from paddle.fluid.layers.io import Recv +from paddle.fluid.layers.io import Send + +from paddle.fluid.transpiler.details import program_to_code + + +class TestProgram2Code(unittest.TestCase): + def test_print(self): + place = fluid.CPUPlace() + self.init_serv(place) + self.init_client(place, 9123) + + def init_serv(self, place): + main = fluid.Program() + + with fluid.program_guard(main): + serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False) + with serv.do(): + out_var = main.global_block().create_var( + name="scale_0.tmp_0", + psersistable=True, + dtype="float32", + shape=[32, 32]) + x = layers.data( + shape=[32, 32], + dtype='float32', + name="X", + append_batch_size=False) + fluid.initializer.Constant(value=1.0)(x, main.global_block()) + layers.scale(x=x, scale=10.0, out=out_var) + + program_to_code(main) + + def init_client(self, place, port): + main = fluid.Program() + with fluid.program_guard(main): + x = layers.data( + shape=[32, 32], + dtype='float32', + name='X', + append_batch_size=False) + fluid.initializer.Constant(value=2.3)(x, main.global_block()) + get_var = main.global_block().create_var( + name="scale_0.tmp_0", # server side var + dtype="float32", + persistable=False, + shape=[32, 32]) + fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) + Send("127.0.0.1:%d" % port, [x]) + o = Recv("127.0.0.1:%d" % port, [get_var]) + + program_to_code(main) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_protobuf.py b/python/paddle/fluid/tests/unittests/test_protobuf.py index c3f1fa80185bfc4afc3ed715d736bcba092629d8..7b80927c48d02e83a9bfaac572c81a6a95a69c8c 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf.py +++ 
b/python/paddle/fluid/tests/unittests/test_protobuf.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid.proto.framework_pb2 as framework_pb2 import unittest diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 3f9059fb5b31cd009c068ccddc9a8938adae5772..d24b5cbd06ddf9f332c1369ebd513bef27b77e14 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core +import paddle.compat as cpt from paddle.fluid.framework import Program @@ -68,7 +71,7 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(8, len(op.attr_names())) op.set_block_attr("block_attr", program_desc.block(0)) - self.assertEqual(0, op.block_attr("block_attr")) + self.assertEqual(0, op.block_attr_id("block_attr")) mul_op = block.append_op() mul_op.set_type("mul") @@ -108,7 +111,7 @@ class TestVarDesc(unittest.TestCase): def test_shape(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_var') + var = block.var(cpt.to_bytes('my_var')) var.set_type(core.VarDesc.VarType.SELECTED_ROWS) src_shape = [3, 2, 10, 8] var.set_shape(src_shape) @@ -119,7 +122,7 @@ class TestVarDesc(unittest.TestCase): def test_multiple_shape(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_reader') + var = block.var(cpt.to_bytes('my_reader')) var.set_type(core.VarDesc.VarType.READER) src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]] var.set_shapes(src_shapes) @@ -130,7 +133,7 @@ class TestVarDesc(unittest.TestCase): def test_dtype(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_var') + var = block.var(cpt.to_bytes('my_var')) var.set_type(core.VarDesc.VarType.LOD_TENSOR) var.set_dtype(core.VarDesc.VarType.INT32) self.assertEqual(core.VarDesc.VarType.INT32, var.dtype()) @@ -139,7 +142,7 @@ class TestVarDesc(unittest.TestCase): def test_multiple_dtype(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_reader') + var = block.var(cpt.to_bytes('my_reader')) var.set_type(core.VarDesc.VarType.READER) src_types = [ core.VarDesc.VarType.INT32, core.VarDesc.VarType.FP64, @@ -152,7 +155,7 @@ class TestVarDesc(unittest.TestCase): def test_multiple_lod_level(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_reader') + var = block.var(cpt.to_bytes('my_reader')) var.set_type(core.VarDesc.VarType.READER) src_types = [3, 1, 2] var.set_lod_levels(src_types) @@ -166,12 +169,12 @@ class TestBlockDesc(unittest.TestCase): self.assertIsNotNone(program_desc) block = program_desc.block(0) self.assertIsNotNone(block) - var1 = block.var("var1") - var2 = block.var("var2") - var3 = block.var("var3") + var1 = block.var(cpt.to_bytes("var1")) + var2 = block.var(cpt.to_bytes("var2")) + var3 = block.var(cpt.to_bytes("var3")) all_vars = block.all_vars() self.assertEqual(set(all_vars), {var1, var2, var3}) - var2_re = block.find_var("var2") + var2_re = block.find_var(cpt.to_bytes("var2")) self.assertEqual(var2_re, var2) def test_add_op(self): @@ -181,13 +184,13 @@ class 
TestBlockDesc(unittest.TestCase): self.assertIsNotNone(block) op1 = block.append_op() op2 = block.append_op() - op0 = block.prepend_op() + op0 = block._prepend_op() all_ops = [] - for idx in xrange(0, block.op_size()): + for idx in range(0, block.op_size()): all_ops.append(block.op(idx)) self.assertEqual(all_ops, [op0, op1, op2]) - def test_remove_op(self): + def test__remove_op(self): program = Program() program_desc = program.desc self.assertIsNotNone(program_desc) @@ -201,11 +204,11 @@ class TestBlockDesc(unittest.TestCase): op1.set_type("test") op2.set_type("test") - block.remove_op(1, 2) - program.sync_with_cpp() + block._remove_op(1, 2) + program._sync_with_cpp() all_ops = [] - for idx in xrange(0, block.op_size()): + for idx in range(0, block.op_size()): all_ops.append(block.op(idx)) self.assertEqual(all_ops, [op0, op2]) diff --git a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py index 3c2689585061af5a11a247a01b87b432dcd86e13..57e96f1fa34fa94f5e095d088016655f24b58d0c 100644 --- a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py +++ b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py index 137594b9a08e13bf6c3f3779356c209596f9ba8e..067502baecc73cc84a6aa8ab78a9afbcc191c49a 100644 --- a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py +++ b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py new file mode 100644 index 0000000000000000000000000000000000000000..3efe5aac8848b8230f42f4f3905eefc517c0fa5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +import numpy as np +from threading import Thread + + +def feed_data(feed_queue, inputs): + for in_data in inputs: + feed_queue.push(in_data) + + +class TestPyReader(unittest.TestCase): + def setUp(self): + self.capacity = 10 + self.batch_size_min = 10 + self.batch_size_max = 20 + self.shapes = [(-1, 3, 2, 1), (-1, 1)] + self.lod_levels = [0, 0] + self.dtypes = ['float32', 'int64'] + self.iterations = 20 + + def test_single_thread_main(self): + self.main(use_thread=False) + + def test_multiple_thread_main(self): + self.main(use_thread=True) + + def main(self, use_thread=False): + with fluid.program_guard(fluid.Program(), fluid.Program()): + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + executor = fluid.Executor(place) + + data_file = fluid.layers.py_reader( + capacity=self.capacity, + dtypes=self.dtypes, + lod_levels=self.lod_levels, + shapes=self.shapes) + feed_queue = data_file.queue + read_out_data = fluid.layers.read_file(data_file) + self.inputs = [] + + for i in range(self.iterations): + in_data = fluid.LoDTensorArray() + batch_size = np.random.random_integers(self.batch_size_min, + self.batch_size_max) + for shape, dtype in zip(self.shapes, self.dtypes): + next_data = np.random.uniform( + low=0, high=1000, + size=(batch_size, ) + shape[1:]).astype(dtype) + in_data.append( + fluid.executor._as_lodtensor(next_data, place)) + + self.inputs.append(in_data) + + executor.run(fluid.default_startup_program()) + self.outputs = [] + if use_thread: + thread = Thread( + target=feed_data, args=(feed_queue, self.inputs)) + thread.start() + for in_data in self.inputs: + self.outputs.append( + executor.run(fetch_list=list(read_out_data))) + else: + for in_data in self.inputs: + feed_queue.push(in_data) + self.outputs.append( + executor.run(fetch_list=list(read_out_data))) + + feed_queue.close() + self.validate() + + def validate(self): + self.assertEqual(len(self.inputs), len(self.outputs)) + for in_data_list, out_data_list in zip(self.inputs, self.outputs): + self.assertEqual(len(in_data_list), len(out_data_list)) + in_data_list_np = [ + np.array(in_lod_tensor) for in_lod_tensor in in_data_list + ] + for in_data, out_data in zip(in_data_list_np, out_data_list): + self.assertTrue((in_data == out_data).all()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..b7fad9b3a60632adb564e1d155a3d935706b467f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -0,0 +1,229 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import threading +import multiprocessing +import os + + +def as_tensor(np_array_or_tensor, place=None): + if isinstance(np_array_or_tensor, fluid.LoDTensor): + return np_array_or_tensor + + if place is None: + place = fluid.CPUPlace() + + tensor = fluid.LoDTensor() + tensor.set(np_array_or_tensor, place) + return tensor + + +def as_numpy(tensor_or_numpy): + return tensor_or_numpy if isinstance( + tensor_or_numpy, np.ndarray) else np.array(tensor_or_numpy) + + +def feed_data(feed_queue, reader): + data_generator = reader() + while True: + data = next(data_generator, None) + if data is None or not feed_queue.push(data): + break + + +def simple_fc_net(in_size, + class_num, + hidden_sizes, + batch_size, + queue_capacity, + use_double_buffer=False): + reader = fluid.layers.py_reader( + capacity=queue_capacity, + shapes=[[-1, in_size], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + use_double_buffer=False) + feed_queue = reader.queue + reader = fluid.layers.batch(reader, batch_size=batch_size) + if use_double_buffer: + reader = fluid.layers.double_buffer(reader) + + in_data, label = fluid.layers.read_file(reader) + + hidden = in_data + for hidden_size in hidden_sizes: + hidden = fluid.layers.fc( + hidden, + size=hidden_size, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict_label, label=label)) + + optimizer = fluid.optimizer.Adam() + optimizer.minimize(loss) + return in_data, label, loss, optimizer, feed_queue + + +class TestPyReaderUsingExecutor(unittest.TestCase): + def setUp(self): + self.in_size = 1000 + self.hidden_sizes = [50, 30, 20] + self.class_num = 10 + self.batch_size = 32 + self.iterations = 10 + self.queue_capacity = 50 + + def test(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + for use_parallel_executor in [False, True]: + for use_double_buffer in [False, True]: + print('Test Parameters:'), + print({ + 'use_cuda': use_cuda, + 'use_parallel_executor': use_parallel_executor, + 'use_double_buffer': use_double_buffer + }) + self.main(use_cuda, use_parallel_executor, + use_double_buffer) + + def random_reader(self): + def reader(): + self.inputs = [] + cnt = 0 + while True: + tensors = fluid.LoDTensorArray() + in_data = np.random.uniform( + low=0, high=1, size=(1, self.in_size)).astype('float32') + tensors.append(as_tensor(in_data)) + label = np.random.random_integers( + low=0, high=self.class_num - 1, size=(1, 1)).astype('int64') + tensors.append(as_tensor(label)) + + if cnt < self.iterations * self.batch_size * self.batch_size_times: + if cnt % (self.batch_size * self.batch_size_times) == 0: + self.inputs.append([in_data, label]) + else: + self.inputs[-1][0] = np.concatenate( + (self.inputs[-1][0], in_data), axis=0) + self.inputs[-1][1] = np.concatenate( + (self.inputs[-1][1], label), axis=0) + elif not self.use_double_buffer: + break + + yield tensors + cnt += 1 + + yield None + + return reader + + def main(self, + use_cuda=True, + use_parallel_executor=False, + use_double_buffer=False): + assert not use_cuda or use_cuda and core.is_compiled_with_cuda() + + self.use_cuda = use_cuda + self.use_parallel_executor = use_parallel_executor + self.use_double_buffer = use_double_buffer + + 
startup_program = fluid.Program() + main_program = fluid.Program() + + with fluid.program_guard(main_program, startup_program): + in_data, label, loss, optimizer, feed_queue = simple_fc_net( + in_size=self.in_size, + class_num=self.class_num, + hidden_sizes=self.hidden_sizes, + batch_size=self.batch_size, + queue_capacity=self.queue_capacity, + use_double_buffer=self.use_double_buffer) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + startup_exe = fluid.Executor(place) + startup_exe.run(startup_program) + + if use_parallel_executor: + main_exe = fluid.ParallelExecutor(use_cuda, loss_name=loss.name) + if use_cuda: + self.batch_size_times = core.get_cuda_device_count() + else: + self.batch_size_times = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + else: + main_exe = startup_exe + self.batch_size_times = 1 + + reader = self.random_reader() + thread = threading.Thread( + target=feed_data, args=(feed_queue, reader)) + thread.start() + + self.outputs = [] + for _ in range(self.iterations): + fetches = main_exe.run(fetch_list=[in_data.name, label.name]) + fetches = [as_numpy(fetch) for fetch in fetches] + self.outputs.append(fetches) + + feed_queue.close() + self.validate() + + def validate(self): + self.assertEqual(len(self.inputs), len(self.outputs)) + for batch_in, batch_out in zip(self.inputs, self.outputs): + self.assertEqual(len(batch_in), len(batch_out)) + if self.use_parallel_executor and not self.use_double_buffer: + self.validate_unordered_batch(batch_in, batch_out) + else: + for in_data, out_data in zip(batch_in, batch_out): + self.assertEqual(in_data.shape, out_data.shape) + if not self.use_parallel_executor: + self.assertTrue((in_data == out_data).all()) + + def validate_unordered_batch(self, batch_in, batch_out): + out_index_left_set = set(range(self.batch_size * self.batch_size_times)) + mapping_num = 0 + for i in range(self.batch_size * self.batch_size_times): + for j in out_index_left_set: + flag = True + for k in range(len(batch_in)): + in_data = batch_in[k][i] + out_data = batch_out[k][j] + if (in_data != out_data).any(): + flag = False + break + + if flag: + out_index_left_set.remove(j) + mapping_num += 1 + break + + self.assertEqual(mapping_num, self.batch_size * self.batch_size_times) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py index 1c708d0386da4028f1f3d177d0a3fd494c077c6e..f29dddff7a28ed041908741007361224624e436a 100644 --- a/python/paddle/fluid/tests/unittests/test_random_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core @@ -21,11 +23,12 @@ from op_test import OpTest class TestRandomCropOp(OpTest): def setUp(self): to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] * - 5).astype("float32") + 5).astype(np.int32) self.possible_res = [ - np.array([[1, 2, 3], [5, 6, 7]]), np.array([[2, 3, 4], [6, 7, 8]]), - np.array([[5, 6, 7], [9, 10, 11]]), - np.array([[6, 7, 8], [10, 11, 12]]) + np.array([[1, 2, 3], [5, 6, 7]]).astype(np.int32), + np.array([[2, 3, 4], [6, 7, 8]]).astype(np.int32), + np.array([[5, 6, 7], [9, 10, 11]]).astype(np.int32), + np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32) ] self.op_type = "random_crop" self.inputs = {'X': to_crop, 'Seed': np.array([10])} diff --git a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py index 7eba1e2077e25325d537f01f043ed1afa372800c..c9fa24b103deb50aa896403e09b11e891fb62c6d 100644 --- a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py new file mode 100644 index 0000000000000000000000000000000000000000..e97a05b6f929821f82d96b462598a5ff03cf0a48 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -0,0 +1,120 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import os +import paddle.fluid as fluid +import paddle +import numpy as np +import unittest + + +class TestReaderReset(unittest.TestCase): + def prepare_data(self): + def fake_data_generator(): + for n in range(self.total_ins_num): + yield np.ones(self.ins_shape) * n, n + + # Prepare data + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(fake_data_generator, batch_size=1) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name='data', shape=[3], dtype='float32'), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + self.data_file_name, reader, feeder) + + def setUp(self): + # set parallel threads to fit 20 batches in line 49 + os.environ['CPU_NUM'] = str(20) + self.use_cuda = fluid.core.is_compiled_with_cuda() + self.data_file_name = './reader_reset_test.recordio' + self.ins_shape = [3] + self.batch_size = 5 + self.total_ins_num = self.batch_size * 20 + self.test_pass_num = 100 + self.prepare_data() + + def main(self, with_double_buffer): + main_prog = fluid.Program() + startup_prog = fluid.Program() + + with fluid.program_guard(main_prog, startup_prog): + data_reader_handle = fluid.layers.io.open_files( + filenames=[self.data_file_name], + shapes=[[-1] + self.ins_shape, [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=1, + pass_num=1) + data_reader = fluid.layers.io.batch(data_reader_handle, + self.batch_size) + if with_double_buffer: + data_reader = fluid.layers.double_buffer(data_reader) + image, label = fluid.layers.read_file(data_reader) + fetch_list = [image.name, label.name] + + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + build_strategy = fluid.BuildStrategy() + if with_double_buffer: + build_strategy.enable_data_balance = True + exec_strategy = fluid.ExecutionStrategy() + parallel_exe = fluid.ParallelExecutor( + use_cuda=self.use_cuda, + main_program=main_prog, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + data_appeared = [False] * self.total_ins_num + pass_count = 0 + while (True): + try: + data_val, label_val = parallel_exe.run(fetch_list, + return_numpy=True) + ins_num = data_val.shape[0] + broadcasted_label = np.ones((ins_num, ) + tuple( + self.ins_shape)) * label_val.reshape((ins_num, 1)) + self.assertEqual(data_val.all(), broadcasted_label.all()) + for l in label_val: + self.assertFalse(data_appeared[l[0]]) + data_appeared[l[0]] = True + + except fluid.core.EOFException: + pass_count += 1 + if with_double_buffer: + data_appeared = data_appeared[:-parallel_exe.device_count * + self.batch_size] + for i in data_appeared: + self.assertTrue(i) + if pass_count < self.test_pass_num: + data_appeared = [False] * self.total_ins_num + data_reader_handle.reset() + else: + break + + def test_all(self): + self.main(with_double_buffer=False) + self.main(with_double_buffer=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 69a522e273db017ac55b408276b4a28f5f907c42..c5210bb2085bc386df43cd0d20292d7b308a1093 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # 
limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle +import paddle.dataset.mnist as mnist class TestRecordIO(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index d6ff18430e319e236f03d5661381e923cc956590..6dfc85e301a2eda66bade09a8b6dd0004155f385 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.layers as layers @@ -203,12 +205,12 @@ class RecurrentOpTest1(unittest.TestCase): num_grad[idx], ana_grad[idx], rtol=0.1).all()) def check_forward(self): - print 'test recurrent op forward' + print('test recurrent op forward') pd_output = self.forward() py_output = self.py_rnn.forward() - print 'pd_output', pd_output + print('pd_output', pd_output) print - print 'py_output', py_output + print('py_output', py_output) self.assertEqual(pd_output.shape, py_output.shape) self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all()) @@ -445,7 +447,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape, self.output_shape) self.output = layers.mean(self.create_rnn_op(), **self.p_info) - print self.main_program + print(self.main_program) def create_rnn_op(self): x = layers.data( diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 865c2b7df085aa6a6cb0d6eb461c342ce08695cd..328f0f0011381b77cccb8b2d9b266aa53b259473 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -89,15 +91,11 @@ class TestProdOp(OpTest): self.check_grad(['X'], 'Out') -class TestKeepDimReduce(OpTest): +class Test1DReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.attrs = {'dim': [-2], 'keep_dim': True} - self.outputs = { - 'Out': - self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True) - } + self.inputs = {'X': np.random.random(20).astype("float64")} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} def test_check_output(self): self.check_output() @@ -106,32 +104,82 @@ class TestKeepDimReduce(OpTest): self.check_grad(['X'], 'Out') -class Test1DReduce(OpTest): +class Test2DReduce0(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random(20).astype("float64")} + self.attrs = {'dim': [0]} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} self.outputs = {'Out': self.inputs['X'].sum(axis=0)} - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out') +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce3(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } -class TestReduceAll(OpTest): +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} self.attrs = {'reduce_all': True} self.outputs = {'Out': self.inputs['X'].sum()} - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - ## reduction in multi dims class TestReduceMeanOpMultiAxises(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py index a361c4624e3e2efa817e8137ff31133997a0a1fb..7381bb61eb4630cb67bc306fde211704e9580af4 100644 --- 
a/python/paddle/fluid/tests/unittests/test_registry.py +++ b/python/paddle/fluid/tests/unittests/test_registry.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function import unittest import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 9b1c4ceada52322b3f1fdc4ab2e90a2c089ee67e..6727335c6059161d235a64a1b90d36b84004f9b3 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.framework as framework diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index a70321bd800bf25eeb9e5d197ea7e08626b9aede..28c8c4699adbc108c05e4a500815752e2ec24c61 100644 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.layers.control_flow import lod_rank_table import numpy +import functools class TestReorderLoDTensor(unittest.TestCase): @@ -34,7 +38,7 @@ class TestReorderLoDTensor(unittest.TestCase): dat.stop_gradient = False rank_dat = fluid.layers.data( name=cls.data_desc[1][0], shape=cls.data_desc[1][1]) - table = fluid.layers.lod_rank_table(rank_dat) + table = lod_rank_table(rank_dat) new_dat = fluid.layers.reorder_lod_tensor_by_rank( x=dat, rank_table=table) loss = fluid.layers.reduce_sum(new_dat) @@ -100,7 +104,8 @@ class TestReorderLoDTensor(unittest.TestCase): rank_table = [] # list of (index, length) for i in range(len(ref_lod)): rank_table.append((i, ref_lod[i])) - rank_table = sorted(rank_table, lambda x, y: y[1] - x[1]) + rank_table = sorted( + rank_table, key=functools.cmp_to_key(lambda x, y: y[1] - x[1])) # compute the input sequence info according to input_lod input_value, input_lod = self.data[self.data_desc[0][0]] diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index f51b5a7e9907294a5b91c920a363830d8b9a7137..0557593657e2e480a509902a07f25723b2c710b0 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np @@ -20,106 +22,39 @@ from op_test import OpTest class TestReshapeOp(OpTest): def setUp(self): - ori_shape = (2, 25) - new_shape = (5, 10) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} - self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpDimInfer1(OpTest): - def setUp(self): - ori_shape = (5, 10) - new_shape = (5, -1, 5) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} - self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpDimInfer2(OpTest): - def setUp(self): - ori_shape = (2, 2, 6) - new_shape = (2, 0, 3, -1) - infered_shape = (2, 2, 3, -1) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} - self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpInplace(OpTest): - def setUp(self): - ori_shape = (2, 25) - new_shape = (5, 10) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpDimInferInplace1(OpTest): - def setUp(self): - ori_shape = (5, 10) - new_shape = (5, -1, 5) + self.init_data() + self.op_type = "reshape2" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.attrs = {"shape": self.new_shape} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + def init_data(self): + self.ori_shape = (2, 25) + self.new_shape = (5, 10) + self.infered_shape = (5, 10) def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): self.check_grad(["X"], "Out") -class TestReshapeOpDimInferInplace2(OpTest): - def setUp(self): - ori_shape = (2, 2, 6) - new_shape = (2, 0, 3, -1) - infered_shape = (2, 2, 3, -1) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} +class TestReshapeOpDimInfer1(TestReshapeOp): + def init_data(self): + self.ori_shape = (5, 10) + self.new_shape = (5, -1, 5) + self.infered_shape = (5, -1, 5) - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(["X"], "Out") +class TestReshapeOpDimInfer2(TestReshapeOp): + def init_data(self): + self.ori_shape = (2, 2, 6) + self.new_shape = (2, 0, 3, -1) + self.infered_shape = (2, 2, 
3, -1) class TestReshapeOpWithInputShape(OpTest): @@ -128,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest): new_shape = (0, -1, 5) actual_shape = (2, 3, 5) - self.op_type = "reshape" + self.op_type = "reshape2" self.inputs = { "X": np.random.random(ori_shape).astype("float32"), "Shape": np.array( actual_shape, dtype="int32") } self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} + self.outputs = { + "Out": self.inputs["X"].reshape(actual_shape), + 'XShape': np.random.random(ori_shape).astype("float32") + } def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", sum_outputs=["Out"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py index f845575a02869f08299d76b5600074598ca27f6c..e83f548c228c7c045ff795e882738ea56e3f2d24 100644 --- a/python/paddle/fluid/tests/unittests/test_reverse_op.py +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 0d84a5853ead45b84de9383dd8749992d2f91440..70848e4e2239e2be160bb0c1a28a5aecd01a87dc 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -12,91 +12,167 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest + import numpy as np -from op_test import OpTest - - -class TestRmspropOp1(OpTest): - ''' Test RMSProp with explicit inputs - ''' - - def setUp(self): - self.op_type = "rmsprop" - - param = np.random.random((123, 321)).astype("float32") - mean_square = np.random.random((123, 321)).astype("float32") - learning_rate = np.array([0.01]).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - moment = np.zeros((123, 321)).astype("float32") - - epsilon = 1e-6 - decay = 0.9 - momentum = 0.0 - - self.inputs = { - 'Param': param, - 'MeanSquare': mean_square, - 'LearningRate': learning_rate, - 'Grad': grad, - 'Moment': moment, - } - - self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum} - - ms_out = decay * mean_square + (1 - decay) * grad * grad - moment_out = momentum * moment + \ - learning_rate * grad / np.sqrt(ms_out + epsilon) - param_out = param - moment_out - - self.outputs = { - 'ParamOut': param_out, - 'MomentOut': moment_out, - 'MeanSquareOut': ms_out - } - - def test_check_output(self): - self.check_output() - - -class TestRmspropOp2(OpTest): - '''Test RMSProp with default values for attributes - ''' - - def setUp(self): - self.op_type = "rmsprop" - - param = np.random.random((123, 321)).astype("float32") - mean_square = np.random.random((123, 321)).astype("float32") - learning_rate = np.array([0.01]).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - moment = np.zeros((123, 321)).astype("float32") - - epsilon = 1.0e-10 - decay = 0.9 - momentum = 0.0 - - self.inputs = { - 'Param': param, - 'MeanSquare': mean_square, - 'LearningRate': learning_rate, - 'Grad': grad, - 'Moment': moment, - } - - ms_out = decay * mean_square + (1 - decay) * grad * grad - moment_out = momentum * moment + \ - learning_rate * grad / np.sqrt(ms_out + epsilon) - param_out = param - moment_out - - self.outputs = { - 'ParamOut': param_out, - 'MomentOut': moment_out, - 'MeanSquareOut': ms_out - } - - def test_check_output(self): - self.check_output() +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +class TestBase(unittest.TestCase): + def setup(self, centered, epsilon=1e-6): + np.random.seed(5) # fix seed + + self.param_name = "param" + self.param = np.random.random((123, 321)).astype("float32") + + self.mean_square_name = "mean_square" + self.mean_square = np.random.random((123, 321)).astype("float32") + + self.mean_grad_name = "mean_grad" + self.mean_grad = np.random.random((123, 321)).astype("float32") + + self.lr_name = "lr" + self.learning_rate = np.array([0.01]).astype("float32") + + self.grad_name = "grad" + self.grad = np.random.random((123, 321)).astype("float32") + + self.moment_name = "moment" + self.moment = np.zeros((123, 321)).astype("float32") + + self.epsilon = epsilon + self.decay = 0.9 + self.momentum = 0.0 + self.centered = centered + + self.ms_out = self.decay * self.mean_square + (1 - self.decay + ) * self.grad * self.grad + if centered: + self.mg_out = self.decay * self.mean_grad + (1 - self.decay + ) * self.grad + self.moment_out = self.momentum * self.moment + \ + self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon) + else: + self.moment_out = self.momentum * self.moment + \ + self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon) + + self.param_out = self.param - self.moment_out + + def check(self, + actual_t, + expect_t, + place, + out_name, + atol=1e-5, + equal_nan=False): + 
self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol, equal_nan=equal_nan), + "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + + str(expect_t) + "\n" + "But Got" + str(actual_t)) + + +class TestRmspropOp(TestBase): + def check_with_place(self, place, centered, epsilon): + self.setup(centered, epsilon) + scope = core.Scope() + + # create and initialize Param Variable + param = scope.var(self.param_name).get_tensor() + param.set(self.param, place) + + mean_square = scope.var(self.mean_square_name).get_tensor() + mean_square.set(self.mean_square, place) + + lr = scope.var(self.lr_name).get_tensor() + lr.set(self.learning_rate, place) + + grad = scope.var(self.grad_name).get_tensor() + grad.set(self.grad, place) + + moment = scope.var(self.moment_name).get_tensor() + moment.set(self.moment, place) + + # create and run sgd operator + + if self.centered: + mean_grad = scope.var(self.mean_grad_name).get_tensor() + mean_grad.set(self.mean_grad, place) + + rmsprop_op = Operator( + "rmsprop", + Param=self.param_name, + Grad=self.grad_name, + MeanSquare=self.mean_square_name, + MeanGrad=self.mean_grad_name, + Moment=self.moment_name, + LearningRate=self.lr_name, + ParamOut=self.param_name, + MeanSquareOut=self.mean_square_name, + MomentOut=self.moment_name, + MeanGradOut=self.mean_grad_name, + epsilon=self.epsilon, + decay=self.decay, + momentum=self.momentum, + centered=True) + else: + rmsprop_op = Operator( + "rmsprop", + Param=self.param_name, + Grad=self.grad_name, + MeanSquare=self.mean_square_name, + Moment=self.moment_name, + LearningRate=self.lr_name, + ParamOut=self.param_name, + MeanSquareOut=self.mean_square_name, + MomentOut=self.moment_name, + epsilon=self.epsilon, + decay=self.decay, + momentum=self.momentum, + centered=False) + + rmsprop_op.run(scope, place) + + atol = 1e-5 + equal_nan = False + + if self.centered: + atol = 1e-3 + equal_nan = True + + self.check( + np.array(mean_square), self.ms_out, place, self.mean_square_name) + self.check( + np.array(moment), + self.moment_out, + place, + self.moment_name, + atol=atol, + equal_nan=equal_nan) + self.check( + np.array(param), + self.param_out, + place, + self.param_name, + atol=atol, + equal_nan=equal_nan) + + if self.centered: + self.check( + np.array(mean_grad), self.mg_out, place, self.mean_grad_name) + + def test_rmsprop(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place, False, 1e-6) + self.check_with_place(place, False, 1e-10) + self.check_with_place(place, True, 1e-6) + self.check_with_place(place, True, 1e-10) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py index 178606f05961263df5ef0398064a1fd135fbe784..9bfec8e9bdd8c4667fb19f3dd479b759d6dd665b 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest from paddle.fluid.framework import Program diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py index df5684ab173a4889dd7b693f9246bafd12e0345f..ad4cd2e803bfae4c3fbc04503331b9a786b25d17 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import math import sys +import paddle.compat as cpt from op_test import OpTest @@ -58,11 +61,11 @@ class TestROIPoolOp(OpTest): for i in range(self.rois_num): roi = self.rois[i] - roi_batch_id = roi[0] - roi_start_w = int(round(roi[1] * self.spatial_scale)) - roi_start_h = int(round(roi[2] * self.spatial_scale)) - roi_end_w = int(round(roi[3] * self.spatial_scale)) - roi_end_h = int(round(roi[4] * self.spatial_scale)) + roi_batch_id = int(roi[0]) + roi_start_w = int(cpt.round(roi[1] * self.spatial_scale)) + roi_start_h = int(cpt.round(roi[2] * self.spatial_scale)) + roi_end_w = int(cpt.round(roi[3] * self.spatial_scale)) + roi_end_h = int(cpt.round(roi[4] * self.spatial_scale)) roi_height = int(max(roi_end_h - roi_start_h + 1, 1)) roi_width = int(max(roi_end_w - roi_start_w + 1, 1)) @@ -97,8 +100,8 @@ class TestROIPoolOp(OpTest): for w in range(wstart, wend): if x_i[c, h, w] > out_data[i, c, ph, pw]: out_data[i, c, ph, pw] = x_i[c, h, w] - argmax_data[i, c, ph, pw] = h * \ - self.width + w + argmax_data[i, c, ph, + pw] = h * self.width + w self.outs = out_data.astype('float32') self.argmaxes = argmax_data.astype('int64') @@ -110,19 +113,19 @@ class TestROIPoolOp(OpTest): self.rois_lod[0].append(bno + 1) for i in range(bno + 1): x1 = np.random.random_integers( - 0, self.width / self.spatial_scale - self.pooled_width) + 0, self.width // self.spatial_scale - self.pooled_width) y1 = np.random.random_integers( - 0, self.height / self.spatial_scale - self.pooled_height) + 0, self.height // self.spatial_scale - self.pooled_height) x2 = np.random.random_integers(x1 + self.pooled_width, - self.width / self.spatial_scale) - y2 = np.random.random_integers(y1 + self.pooled_height, - self.height / self.spatial_scale) + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) roi = [bno, x1, y1, x2, y2] rois.append(roi) self.rois_num = len(rois) - self.rois = np.array(rois).astype("int64") + self.rois = np.array(rois).astype("float32") def setUp(self): self.op_type = "roi_pool" diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py index 07dcd108689ae6069e30fe22029258d192215549..2f13f067ef313685227c7de9a49fae8640ca6b32 100644 --- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -0,0 +1,218 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from test_anchor_generator_op import anchor_generator_in_python +from test_generate_proposal_labels_op import _generate_groundtruth +from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta + + +def rpn_target_assign(anchor_by_gt_overlap, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + anchor_to_gt_max = anchor_by_gt_overlap[np.arange( + anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax] + + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange( + anchor_by_gt_overlap.shape[1])] + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1 + labels[anchors_with_max_overlap] = 1 + labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 + + num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg and use_random: + disable_inds = np.random.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + else: + disable_inds = fg_inds[num_fg:] + labels[disable_inds] = -1 + fg_inds = np.where(labels == 1)[0] + + num_bg = rpn_batch_size_per_im - np.sum(labels == 1) + bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] + if len(bg_inds) > num_bg and use_random: + enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] + else: + enable_inds = bg_inds[:num_bg] + labels[enable_inds] = 0 + fg_inds = np.where(labels == 1)[0] + bg_inds = np.where(labels == 0)[0] + + loc_index = fg_inds + score_index = np.hstack((fg_inds, bg_inds)) + labels = labels[score_index] + assert not np.any(labels == -1), "Wrong labels with -1" + + gt_inds = anchor_to_gt_argmax[fg_inds] + + return loc_index, score_index, labels, gt_inds + + +def get_anchor(n, c, h, w): + input_feat = np.random.random((n, c, h, w)).astype('float32') + anchors, _ = anchor_generator_in_python( + input_feat=input_feat, + anchor_sizes=[32., 64.], + aspect_ratios=[0.5, 1.0], + variances=[1.0, 1.0, 1.0, 1.0], + stride=[16.0, 16.0], + offset=0.5) + return anchors + + +def rpn_target_assign_in_python(all_anchors, + gt_boxes, + is_crowd, + im_info, + lod, + 
rpn_straddle_thresh, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_num = all_anchors.shape[0] + batch_size = len(lod) - 1 + for i in range(batch_size): + im_height = im_info[i][0] + im_width = im_info[i][1] + im_scale = im_info[i][2] + if rpn_straddle_thresh >= 0: + # Only keep anchors inside the image by a margin of straddle_thresh + inds_inside = np.where( + (all_anchors[:, 0] >= -rpn_straddle_thresh) & + (all_anchors[:, 1] >= -rpn_straddle_thresh) & ( + all_anchors[:, 2] < im_width + rpn_straddle_thresh) & ( + all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0] + # keep only inside anchors + inside_anchors = all_anchors[inds_inside, :] + else: + inds_inside = np.arange(all_anchors.shape[0]) + inside_anchors = all_anchors + + b, e = lod[i], lod[i + 1] + gt_boxes_slice = gt_boxes[b:e, :] * im_scale + is_crowd_slice = is_crowd[b:e] + + not_crowd_inds = np.where(is_crowd_slice == 0)[0] + gt_boxes_slice = gt_boxes_slice[not_crowd_inds] + iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) + + loc_inds, score_inds, labels, gt_inds = rpn_target_assign( + iou, rpn_batch_size_per_im, rpn_positive_overlap, + rpn_negative_overlap, rpn_fg_fraction, use_random) + # unmap to all anchor + loc_inds = inds_inside[loc_inds] + score_inds = inds_inside[score_inds] + + sampled_gt = gt_boxes_slice[gt_inds] + sampled_anchor = all_anchors[loc_inds] + box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.]) + + if i == 0: + loc_indexes = loc_inds + score_indexes = score_inds + tgt_labels = labels + tgt_bboxes = box_deltas + else: + loc_indexes = np.concatenate( + [loc_indexes, loc_inds + i * anchor_num]) + score_indexes = np.concatenate( + [score_indexes, score_inds + i * anchor_num]) + tgt_labels = np.concatenate([tgt_labels, labels]) + tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) + + return loc_indexes, score_indexes, tgt_bboxes, tgt_labels + + +class TestRpnTargetAssignOp(OpTest): + def setUp(self): + n, c, h, w = 2, 4, 14, 14 + all_anchors = get_anchor(n, c, h, w) + gt_num = 10 + all_anchors = all_anchors.reshape(-1, 4) + anchor_num = all_anchors.shape[0] + + images_shape = [[64, 64], [64, 64]] + #images_shape = [[64, 64]] + groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) + lod = [0, 4, 8] + #lod = [0, 4] + + im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + im_info[i, 0] = images_shape[i][0] + im_info[i, 1] = images_shape[i][1] + im_info[i, 2] = 0.8 #scale + gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) + is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) + + all_anchors = all_anchors.astype('float32') + gt_boxes = gt_boxes.astype('float32') + + rpn_straddle_thresh = 0.0 + rpn_batch_size_per_im = 256 + rpn_positive_overlap = 0.7 + rpn_negative_overlap = 0.3 + rpn_fg_fraction = 0.5 + use_random = False + + loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python( + all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh, + rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, + rpn_fg_fraction, use_random) + labels = labels[:, np.newaxis] + + self.op_type = "rpn_target_assign" + self.inputs = { + 'Anchor': all_anchors, + 'GtBoxes': (gt_boxes, [[4, 4]]), + 'IsCrowd': (is_crowd, [[4, 4]]), + 'ImInfo': (im_info, [[1, 1]]) + } + self.attrs = { + 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, + 'rpn_positive_overlap': rpn_positive_overlap, + 
'rpn_negative_overlap': rpn_negative_overlap, + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random + } + self.outputs = { + 'LocationIndex': loc_index.astype('int32'), + 'ScoreIndex': score_index.astype('int32'), + 'TargetBBox': tgt_bbox.astype('float32'), + 'TargetLabel': labels.astype('int32') + } + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py new file mode 100644 index 0000000000000000000000000000000000000000..674ef2ddf44edb4246c9d952cb75b36fe3d6ddc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py @@ -0,0 +1,61 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +class TestSamplingIdOp(OpTest): + def setUp(self): + self.op_type = "sampling_id" + self.use_mkldnn = False + self.init_kernel_type() + self.X = np.random.random((100, 10)).astype('float32') + self.inputs = {"X": self.X} + self.Y = np.random.random(100).astype('int64') + self.outputs = {'Out': self.Y} + self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + y1 = self.out + self.check_output_customized(self.verify_output) + y2 = self.out + + # check dtype + assert y1.dtype == np.int64 + assert y2.dtype == np.int64 + + # check output is index ids of inputs + inputs_ids = np.arange(self.X.shape[1]) + assert np.isin(y1, inputs_ids).all() + assert np.isin(y2, inputs_ids).all() + + self.assertTrue(np.array_equal(y1, y2)) + self.assertEqual(len(y1), len(self.Y)) + + def verify_output(self, outs): + out = np.array(outs[0]) + self.out = out + + def init_kernel_type(self): + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index 53f59c399054a96f5b5f07a390e6fa9eeae878ce..032af6ed5ce9e1007d6775306ef4c0aefb9dcc41 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestScaleOp(OpTest): @@ -31,5 +35,57 @@ class TestScaleOp(OpTest): self.check_grad(['X'], 'Out') +class TestScaleOpSelectedRows(unittest.TestCase): + def check_with_place(self, place, in_name, out_name): + scope = core.Scope() + + # create and initialize the input SelectedRows Variable + in_height = 10 + in_rows = [0, 4, 7] + in_row_numel = 12 + scale = 2.0 + + in_selected_rows = scope.var(in_name).get_selected_rows() + in_selected_rows.set_height(in_height) + in_selected_rows.set_rows(in_rows) + in_array = np.random.random( + (len(in_rows), in_row_numel)).astype("float32") + + in_tensor = in_selected_rows.get_tensor() + in_tensor.set(in_array, place) + + # create and initialize the output SelectedRows Variable + out_selected_rows = scope.var(out_name).get_selected_rows() + out_tensor = out_selected_rows.get_tensor() + out_tensor._set_dims(in_tensor._get_dims()) + + # create and run scale operator + scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale) + scale_op.run(scope, place) + + # get and compare result + out_height = out_selected_rows.height() + out_rows = out_selected_rows.rows() + result_array = np.array(out_tensor) + + assert (in_array * scale == result_array).all() + assert in_height == out_height + assert in_rows == out_rows + + def test_scale_selected_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place, 'in', 'out') + + def test_scale_selected_rows_inplace(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place, 'in', 'in') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index fb1728743630b3ea908ae835444eff7fd71b72c8..088996f9d7dee1ea914e36e3342c9a5110001c44 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py index d249a989a9499d01f6ed10d6cdbc6c456a7262c5..45fcbfba6eb7c6fc4e75f6d8228d721c0186ef36 100644 --- a/python/paddle/fluid/tests/unittests/test_scope.py +++ b/python/paddle/fluid/tests/unittests/test_scope.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid.core import unittest diff --git a/python/paddle/fluid/tests/unittests/test_selected_rows.py b/python/paddle/fluid/tests/unittests/test_selected_rows.py index 3d7b86787fbf0a855bcd86b8a873c9134cb1d5cc..2f34f79b8eafad8e7fdf6b359548747f354b141f 100644 --- a/python/paddle/fluid/tests/unittests/test_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_selected_rows.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License.
+from __future__ import print_function + import paddle.fluid.core as core import unittest import numpy as np @@ -40,12 +42,12 @@ class TestSelectedRows(unittest.TestCase): # compare tensor self.assertAlmostEqual(2.0, - selected_rows.get_tensor().get_float_element(0)) + selected_rows.get_tensor()._get_float_element(0)) self.assertAlmostEqual(1.0, - selected_rows.get_tensor().get_float_element(1)) + selected_rows.get_tensor()._get_float_element(1)) self.assertAlmostEqual( 4.0, - selected_rows.get_tensor().get_float_element(2 * row_numel + 8)) + selected_rows.get_tensor()._get_float_element(2 * row_numel + 8)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py index 11ffa761a690eb1f9f6dc50c45128a99301741db..9d1d139721ad7ee3e29d44c9b3e7c666b78a4556 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py index 9701d9adef1fd272f2520f66607acded6a8c25c6..dcc86382e5286f354c4f2e81ead598f12c75b2c1 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_conv.py +++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import random @@ -26,9 +28,9 @@ class TestSeqProject(OpTest): if self.context_length == 1 \ and self.context_start == 0 \ and self.padding_trainable: - print "If context_start is 0 " \ + print("If context_start is 0 " \ "and context_length is 1," \ - " padding_trainable should be false." + " padding_trainable should be false.") return # one level, batch size @@ -212,7 +214,7 @@ class TestSeqProjectCase2(TestSeqProject): self.context_stride = 1 self.input_size = [self.input_row, 23] - idx = range(self.input_size[0]) + idx = list(range(self.input_size[0])) del idx[0] offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + [self.input_size[0]]] diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index 0b3659d7a67956f7546d368346bd102eeedf1d97..66e77714c5d65d51262f76519901032182985ea8 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9814ec0a15e1803b356f300d378c31e57ba36c09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py @@ -0,0 +1,105 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +def sequence_enumerate(input_seq, in_lod, win_size, pad_value): + lod0 = [0] + for i in range(0, len(in_lod[0])): + lod0.append(lod0[i] + in_lod[0][i]) + out_seq = [] + for i in range(0, len(lod0) - 1): + for idx in range(lod0[i], lod0[i + 1]): + single_seq = [] + for word_idx in range(win_size): + word_pos = idx + word_idx + dat = input_seq[word_pos] if word_pos < lod0[i+1] \ + else pad_value + single_seq.append(dat) + out_seq.append(single_seq) + return out_seq + + +class TestSequenceEnumerateOp(OpTest): + def setUp(self): + self.op_type = "sequence_enumerate" + self.init_test_case() + self.inputs = {'X': (self.in_seq, self.lod)} + self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value} + self.outputs = {'Out': (self.out_seq, self.lod)} + + def test_check_output(self): + self.check_output() + + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 2 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +class TestSequenceEnumerateOpInt64(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64") + self.lod = [[9, 4, 11, 6]] + self.win_size = 2 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int64") + + +class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 5 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 30 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 5 + self.pad_value = 5 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py index 8f0765277ae85af2b17ad96d4fd0c1148c393ff0..92cd5b0cbcd1ab56300158d26850969870e86f2b 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index 0bbd31814efdff6050733f6876ef64e3fcaaaf76..ffd4026dbade2f8f7eace399c52ae0428f3e8d7b 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -44,7 +46,7 @@ class TestSequenceExpand(OpTest): out_lod = [[]] offset = 0 - for i in xrange(len(y_lod[ref_level])): + for i in range(len(y_lod[ref_level])): repeat_num = y_lod[ref_level][i] x_len = x_idx[i] @@ -55,7 +57,7 @@ class TestSequenceExpand(OpTest): stacked_x_sub = np.vstack((stacked_x_sub, x_sub)) out = np.vstack((out, stacked_x_sub)) if x_lod is not None: - for j in xrange(repeat_num): + for j in range(repeat_num): out_lod[0].append(x_len) offset += x_len diff --git a/python/paddle/fluid/tests/unittests/test_sequence_mask.py b/python/paddle/fluid/tests/unittests/test_sequence_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..02c5b204082ece0d98d014c952293c5be39520ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_mask.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid.framework import convert_np_dtype_to_dtype_ +import paddle.fluid.core as core +import numpy as np +import copy +import unittest + + +class SequenceMaskTestBase(OpTest): + def initDefaultParameters(self): + self.op_type = 'sequence_mask' + self.maxlen = 10 + self.mask_dtype = 'int64' + self.x = [[0, 3, 4], [5, 7, 9]] + + def initParameters(self): + pass + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + if not isinstance(self.x, np.ndarray): + self.x = np.array(self.x) + + self.inputs = {'X': self.x} + self.outputs = {'Y': self.calc_ground_truth_mask()} + self.attrs = { + 'maxlen': self.maxlen, + 'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype) + } + + def calc_ground_truth_mask(self): + maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen + shape = self.x.shape + (maxlen, ) + index_broadcast = np.broadcast_to( + np.reshape( + range(maxlen), newshape=[1] * self.x.ndim + [-1]), + shape=shape) + x_broadcast = np.broadcast_to( + np.reshape( + self.x, newshape=self.x.shape + (-1, )), shape=shape) + return (index_broadcast < x_broadcast).astype(self.mask_dtype) + + def test_check_output(self): + self.check_output() + + +class SequenceMaskTest1(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'bool' + + +class SequenceMaskTest2(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'uint8' + + +class SequenceMaskTest3(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'int32' + + +class SequenceMaskTest4(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float32' + + +class SequenceMaskTest5(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float64' + + +class SequenceMaskTest6(SequenceMaskTestBase): + def initParameters(self): + self.maxlen = -1 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..471515c817541976a06eb024fa3d4f77b78f920d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py @@ -0,0 +1,131 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestSequencePadOp(OpTest): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = -1 + self.dtype = 'float32' + + def set_data(self): + x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype) + pad_value_data = np.array(self.pad_value).astype(self.dtype) + self.inputs = { + 'X': (x_data, self.x_len_lod), + 'PadValue': pad_value_data + } + self.attrs = {'padded_length': self.padded_length} + + def compute(self): + # get padded length + padded_length = self.padded_length + x_len_lod_0 = self.x_len_lod[0] + if padded_length == -1: + max_seq_len = 0 + for l in x_len_lod_0: + max_seq_len = max(max_seq_len, l) + padded_length = max_seq_len + + # do padding + x_data = self.inputs['X'][0] + pad_value_data = self.inputs['PadValue'] + if pad_value_data.shape == (1, ): + pad_value_data = np.broadcast_to( + pad_value_data, shape=x_data.shape[1:]) + padded_sequences = [] + start_idx = 0 + for l in x_len_lod_0: + end_idx = start_idx + l + seq = x_data[start_idx:end_idx] + to_pad_len = padded_length - l + for _ in range(to_pad_len): + seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0) + padded_sequences.append(seq) + start_idx = end_idx + + out_data = np.array(padded_sequences) + self.outputs = {'Out': out_data} + + def setUp(self): + self.op_type = 'sequence_pad' + self.set_attr() + self.set_data() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSequencePadOp2(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0, 2.0, 3.0, 4.0] + self.padded_length = -1 + self.dtype = 'float32' + + +class TestSequencePadOp3(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = 7 + self.dtype = 'float32' + + +class TestSequencePadOp4(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0, 2.0, 3.0, 4.0] + self.padded_length = 7 + self.dtype = 'float32' + + +class TestSequencePadOp5(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = -1 + self.dtype = 'float32' + + +class TestSequencePadOp6(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [[1.0, 2.0], [3.0, 4.0]] + self.padded_length = -1 + self.dtype = 'float32' + + +class TestSequencePadOp7(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = 7 + self.dtype = 'float32' diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py index 68f2e5eba35ed318281d14e397dc6d363bcb4079..f11fa6c39c35efc14f8600dd746ab64cc940cd71 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import math @@ -35,7 +37,7 @@ class TestSequenceReshape(OpTest): def compute_output(self, x, x_lod, dimension): x_width = x.shape[1] out_lod = [[]] - for i in xrange(len(x_lod[0])): + for i in range(len(x_lod[0])): seq_len = x_lod[0][i] offset = (seq_len * x_width) / dimension assert int(offset) * dimension == seq_len * x_width diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py index 313e485d1e3080f2c59c68256cbc5c81aa6558cd..1561490087330c9af3ea3e384bf735eaa268a749 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py index e91a69a0f8039651225039beb2a42e8dffeb62d3..3e00e7d95f63ea652ea1964eb792f9393ffa5994 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -61,6 +63,8 @@ class TestSequenceSoftmaxOp(OpTest): # ----------------cudnn Sequencesoftmax---------------- +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp): def init_op_type(self): self.use_cudnn = True diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 3126293f9d8e52daa866be5fc1533648a33f3363..b46e4bfb86bd5dc9c74375693328f2506281be3e 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import paddle.fluid.core as core @@ -124,6 +126,7 @@ class TestSGDOpOptimizeSelectedRows(unittest.TestCase): w_selected_rows = scope.var('Param').get_selected_rows() w_selected_rows.set_height(len(param_rows)) w_selected_rows.set_rows(param_rows) + w_selected_rows.sync_index() w_array = np.ones((len(param_rows), row_width)).astype("float32") for i in range(len(param_rows)): w_array[i] *= i diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py index a62ee050075cb8c9f8817c142825a89c24bdfedf..02231ea943e1e92a08730e6e9f1aa3cefeb927c0 100644 --- a/python/paddle/fluid/tests/unittests/test_shape_op.py +++ b/python/paddle/fluid/tests/unittests/test_shape_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index b779f0fb014bbba62927754ea6f36828a32e6c0a..97f79f9421d498723da4c7992551f1210d3f6003 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core from paddle.fluid.executor import Executor @@ -21,6 +23,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program from paddle.fluid.framework import Program import numpy as np +from paddle.fluid.layers.control_flow import shrink_memory +from paddle.fluid.layers.control_flow import lod_rank_table + class TestShrinkRNNMemoryBase(unittest.TestCase): def setUp(self): @@ -30,23 +35,23 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): x.stop_gradient = False rank_table_tensor = layers.data( 'rank_table_tensor', shape=[1], dtype='float32', lod_level=1) - table = layers.lod_rank_table(x=rank_table_tensor) + table = lod_rank_table(x=rank_table_tensor) i = layers.zeros(dtype='int64', shape=[1]) - self.mem1 = layers.shrink_memory(x=x, i=i, table=table) + self.mem1 = shrink_memory(x=x, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table) + self.mem2 = shrink_memory(x=self.mem1, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) + self.mem3 = shrink_memory(x=self.mem2, i=i, table=table) mem3_mean = layers.mean(self.mem3) append_backward(loss=mem3_mean) self.x_grad = self.main_program.global_block().var('x@GRAD') def sum_lodtensor(self, tensor): sum_res = 0.0 - for i in xrange(np.product(tensor.get_dims())): - sum_res += tensor.get_float_element(i) + for i in range(np.product(tensor.shape())): + sum_res += tensor._get_float_element(i) return sum_res diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index c435796569cd2479c19d70a849991f439bf5292a..97ff203499c0bf223930c904de46e1abdd902799 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np from op_test import OpTest from scipy.special import logit diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py index 087a0c575bfa6bc18cb229ad274b4e1e90210605..85a9d9cae47c2b0942da0e0d962d4512af1566c0 100644 --- a/python/paddle/fluid/tests/unittests/test_sign_op.py +++ b/python/paddle/fluid/tests/unittests/test_sign_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 1a48bce3bb7c74551a365fd471f6869b128babac..134df38eea6655857db04dfdc19dd7f7897946f4 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py index 82305b23a1a1e2cee8cef6b291d848581fe5b509..fab63b7d5631829feffd26fc1dce2bd338d2036b 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_var.py +++ b/python/paddle/fluid/tests/unittests/test_slice_var.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import math import unittest from paddle.fluid.transpiler.distribute_transpiler import slice_variable diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py index e74664dac4d87c5a7e1e061294e93e8267e3cc17..8ab6833821c75262124b3ae4200a17e457b718d5 100644 --- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 279f3073f73d1c36f54bb901d92441a7403ac23f..d88aa1ae1c9d848eba7a2224d22b5201fc27b857 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -26,15 +28,22 @@ def stable_softmax(x): class TestSoftmaxOp(OpTest): + def get_x_shape(self): + return [10, 10] + def setUp(self): self.op_type = "softmax" self.use_cudnn = False self.use_mkldnn = False self.dtype = np.float32 self.init_kernel_type() + self.shape = self.get_x_shape() + + x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, 1, + x.reshape([-1, self.shape[-1]])) + out = out.reshape(self.shape) - x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype) - out = np.apply_along_axis(stable_softmax, 1, x) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} self.attrs = { @@ -63,11 +72,27 @@ class TestSoftmaxOp(OpTest): self.check_grad(["X"], "Out", max_relative_error=0.01) +class TestSoftmaxOp2(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSoftmaxCUDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_cudnn = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSoftmaxFP16Op(TestSoftmaxOp): def init_kernel_type(self): self.dtype = np.float16 @@ -79,6 +104,15 @@ class TestSoftmaxFP16Op(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxFP16Op2(TestSoftmaxFP16Op): + def get_x_shape(self): + return [2, 3, 4, 5] + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_cudnn = True @@ -91,10 +125,22 @@ class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + class TestSoftmaxMKLDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_mkldnn = True +class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index c0d9fc8f22a7c4f791d80a9cad87d003b5d54299..a18941dd3126ac027f022ddafbbaed8516166233 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np @@ -86,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): self.check_grad(["Logits"], "Loss") +class TestSoftmaxWithCrossEntropyOp3(OpTest): + """ + Test softmax with cross entropy operator with ignore_index. 
+ """ + + def setUp(self): + self.op_type = "softmax_with_cross_entropy" + batch_size = 41 + class_num = 37 + + logits = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") + ignore_index = 7 + cross_entropy = np.asmatrix( + [[-np.log(softmax[i][labels[i][0]])] + if labels[i] != ignore_index else [0] + for i in range(softmax.shape[0])], + dtype="float64") + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype("float64"), + "Loss": cross_entropy.astype("float64") + } + self.attrs = {"ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9..5397d5c52158ccfb9ad5703b957ca59d6fa11418 100644 --- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core import numpy as np @@ -19,6 +21,8 @@ import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward +from paddle.fluid.layers.control_flow import split_lod_tensor +from paddle.fluid.layers.control_flow import merge_lod_tensor class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -96,12 +100,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): y = layers.data(name='y', shape=[1]) y.persistable = True - out_true, out_false = layers.split_lod_tensor( - input=x, mask=y, level=level) + out_true, out_false = split_lod_tensor(input=x, mask=y, level=level) out_true.persistable = True out_false.persistable = True - out = layers.merge_lod_tensor( + out = merge_lod_tensor( in_true=out_true, in_false=out_false, mask=y, x=x, level=level) out.persistable = True @@ -142,9 +145,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): level = 0 - out_true, out_false = layers.split_lod_tensor( - input=x, mask=y, level=level) - out = layers.merge_lod_tensor( + out_true, out_false = split_lod_tensor(input=x, mask=y, level=level) + out = merge_lod_tensor( in_true=out_true, in_false=out_false, mask=y, x=x, level=level) mean = layers.mean(out) diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index e9f0a06a56b42952800411d548bb3fc1732e031e..4c3d0258980fd8595704a65219deb520b96e222e 100644 --- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -12,9 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np +import six from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSplitIdsOp(OpTest): @@ -31,5 +36,55 @@ class TestSplitIdsOp(OpTest): self.check_output() +class TestSpliteIds(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + rows = [0, 5, 7, 4, 9] + height = 20 + row_numel = 2 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(rows) + x.set_height(height) + np_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + for j in range(row_numel): + np_array[i, j] = rows[i] + j + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + outs_name = ["out%d" % i for i in six.moves.xrange(3)] + outs = [ + scope.var(var_name).get_selected_rows() for var_name in outs_name + ] + + # expected output selected rows + expected_out_rows = [[0, 9], [7, 4], [5]] + + op = Operator("split_ids", Ids="X", Out=outs_name) + + for _ in range(3): + op.run(scope, place) + + for i in range(len(outs)): + expected_rows = expected_out_rows[i] + self.assertEqual(outs[i].rows(), expected_rows) + for j in range(len(expected_rows)): + row = expected_rows[j] + self.assertAlmostEqual( + float(row), np.array(outs[i].get_tensor())[j, 0]) + self.assertAlmostEqual( + float(row + 1), np.array(outs[i].get_tensor())[j, 1]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index eb49a53e54f4bdb6bcd6cb1991423970f29997bb..3c5dd782f85235c4a2feb5a8ca6d048a012c5e1c 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -26,7 +28,7 @@ class TestSplitOp(OpTest): self.inputs = {'X': x} self.attrs = {'axis': axis, 'sections': [2, 1, 2]} self.outputs = {'Out': [('out%d' % i, out[i]) \ - for i in xrange(len(out))]} + for i in range(len(out))]} def _set_op_type(self): self.op_type = "split" diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index 61040a39ced6dc57d05a10bf0605c80011db45c3..41a5ee59ea523b1f6c5015974a12c526e883fa35 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import paddle.fluid.core as core import numpy as np @@ -53,7 +55,7 @@ class TestSpliteSelectedRows(unittest.TestCase): height_sections = [5, 5, 5, 5, 3] # initialize output variables [out0, out1] - outs_name = ["out%d" % i for i in xrange(len(height_sections))] + outs_name = ["out%d" % i for i in range(len(height_sections))] outs = [ scope.var(var_name).get_selected_rows() for var_name in outs_name ] diff --git a/python/paddle/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py index f0ab5909df62835b252154709e5ff75ca38235c8..a6c2cccd39c9cecb2ae904a1930b44ba18dbbd7e 100644 --- a/python/paddle/fluid/tests/unittests/test_spp_op.py +++ b/python/paddle/fluid/tests/unittests/test_spp_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -26,7 +28,7 @@ class TestSppOp(OpTest): input = np.random.random(self.shape).astype("float32") nsize, csize, hsize, wsize = input.shape out_level_flatten = [] - for i in xrange(self.pyramid_height): + for i in range(self.pyramid_height): bins = np.power(2, i) kernel_size = [0, 0] padding = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py index 78bc300ebec1cd34e44343d47376fef05a6d0135..a8bc1004d9bbe91e323db49c0cf0b576f8da306e 100644 --- a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py +++ b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py index 609445d52287f421e67a5796f9e50c1fb42c8e49..439bae9510ee84b131050bb6804a3ede2ad6a8b3 100644 --- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import unittest from numpy import linalg as LA diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py new file mode 100644 index 0000000000000000000000000000000000000000..204a4bb40196bd1fc2f5861aa31cf9560ea4d349 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +from op_test import OpTest + + +# Correct: General. +class TestSqueezeOp(OpTest): + def setUp(self): + self.op_type = "squeeze2" + self.init_test_case() + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } + + def test_check_output(self): + self.check_output(no_check_set=['XShape']) + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + def init_test_case(self): + self.ori_shape = (1, 3, 1, 5) + self.axes = (0, 2) + self.new_shape = (3, 5) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: There is a negative axis. +class TestSqueezeOp1(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 3, 1, 5) + self.axes = (0, -2) + self.new_shape = (3, 5) + + +# Correct: No axes input. +class TestSqueezeOp2(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 3, 1, 5) + self.axes = () + self.new_shape = (3, 5) + + +# Correct: Only part of the axes are squeezed. +class TestSqueezeOp3(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 1, 5, 1, 4, 1) + self.axes = (1, -1) + self.new_shape = (3, 5, 1, 4) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py new file mode 100644 index 0000000000000000000000000000000000000000..defdeb5d70df4c39ed8e23247270e6eb3dd14a7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from op_test import OpTest +import numpy as np +import unittest + + +class TestStackOpBase(OpTest): + def initDefaultParameters(self): + self.num_inputs = 4 + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'float32' + + def initParameters(self): + pass + + def get_x_names(self): + x_names = [] + for i in range(self.num_inputs): + x_names.append('x{}'.format(i)) + return x_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'stack' + self.x = [] + for i in range(self.num_inputs): + self.x.append( + np.random.random(size=self.input_dim).astype(self.dtype)) + + tmp = [] + x_names = self.get_x_names() + for i in range(self.num_inputs): + tmp.append((x_names[i], self.x[i])) + + self.inputs = {'X': tmp} + self.outputs = {'Y': np.stack(self.x, axis=self.axis)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(self.get_x_names(), 'Y') + + +class TestStackOp1(TestStackOpBase): + def initParameters(self): + self.num_inputs = 16 + + +class TestStackOp2(TestStackOpBase): + def initParameters(self): + self.num_inputs = 20 + + +class TestStackOp3(TestStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestStackOpBase): + def initParameters(self): + self.axis = -4 + + +class TestStackOp5(TestStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestStackOpBase): + def initParameters(self): + self.axis = 3 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py index 7956897d68a3fb49d62ba696d0b6400b4f909989..55820f31b81df9f3618d1004f6d21565564efa29 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from test_sum_op import TestSumOp diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 1d90414e137a70e6265042e24e106fe565802778..74797bb65678404b7b35d06eecc7f9a12b2a346e 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSumOp(OpTest): @@ -40,5 +44,66 @@ class TestSumOp(OpTest): pass +class TestSelectedRowsSumOp(OpTest): + def check_with_place(self, place): + scope = core.Scope() + self.check_input_and_output(scope, place, True, True, True) + self.check_input_and_output(scope, place, False, True, True) + self.check_input_and_output(scope, place, False, False, True) + self.check_input_and_output(scope, place, False, False, False) + + def check_input_and_output(self, + scope, + place, + w1_has_data=False, + w2_has_data=False, + w3_has_data=False): + + self.create_selected_rows(scope, place, "W1", w1_has_data) + self.create_selected_rows(scope, place, "W2", w2_has_data) + self.create_selected_rows(scope, place, "W3", w3_has_data) + + # create Out Variable + out = scope.var('Out').get_selected_rows() + + # create and run sum operator + sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out') + sum_op.run(scope, place) + + has_data_w_num = 0 + for w in [w1_has_data, w2_has_data, w3_has_data]: + if not w: + has_data_w_num += 1 + + self.assertEqual(7 * has_data_w_num, len(out.rows())) + + def create_selected_rows(self, scope, place, var_name, isEmpty): + # create and initialize W Variable + if not isEmpty: + rows = [0, 1, 2, 3, 4, 5, 6] + row_numel = 12 + else: + rows = [] + row_numel = 12 + + var = scope.var(var_name) + w_selected_rows = var.get_selected_rows() + w_selected_rows.set_height(len(rows)) + w_selected_rows.set_rows(rows) + w_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + w_array[i] *= i + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(w_array, place) + + return var + + def test_w_is_selected_rows(self): + places = [core.CPUPlace()] + # currently only support CPU + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_switch.py b/python/paddle/fluid/tests/unittests/test_switch.py index 528c5cce4bc7262ade196f6a81a57a57089117ec..2a9c07a889ba5fe24fd1c098729a233cb8fbb16f 100644 --- a/python/paddle/fluid/tests/unittests/test_switch.py +++ b/python/paddle/fluid/tests/unittests/test_switch.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py index bd208897520122b6a5dcf71da325b1b9dba632f6..aec219f80639415a9be55ba18e7940953d0e11b0 100644 --- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import random diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index f17edd3025b17549892bbd47935a1d2452cefac3..1822957c23d0bb1e4821373515d4faef2b76950e 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import paddle.fluid.core as core import unittest import numpy @@ -25,8 +27,8 @@ class TestTensor(unittest.TestCase): tensor = var.get_tensor() - tensor.set_dims([1000, 784]) - tensor.alloc_int(place) + tensor._set_dims([1000, 784]) + tensor._alloc_int(place) tensor_array = numpy.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1 @@ -44,8 +46,8 @@ class TestTensor(unittest.TestCase): tensor = var.get_tensor() - tensor.set_dims([1000, 784]) - tensor.alloc_float(place) + tensor._set_dims([1000, 784]) + tensor._alloc_float(place) tensor_array = numpy.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) @@ -57,14 +59,35 @@ class TestTensor(unittest.TestCase): self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) self.assertAlmostEqual(2.0, tensor_array_2[19, 11]) + def test_int8_tensor(self): + scope = core.Scope() + var = scope.var("int8_tensor") + cpu_tensor = var.get_tensor() + tensor_array = numpy.random.randint( + -127, high=128, size=[100, 200], dtype=numpy.int8) + place = core.CPUPlace() + cpu_tensor.set(tensor_array, place) + cpu_tensor_array_2 = numpy.array(cpu_tensor) + self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all()) + + if core.is_compiled_with_cuda(): + cuda_tensor = var.get_tensor() + tensor_array = numpy.random.randint( + -127, high=128, size=[100, 200], dtype=numpy.int8) + place = core.CUDAPlace(0) + cuda_tensor.set(tensor_array, place) + cuda_tensor_array_2 = numpy.array(cuda_tensor) + self.assertAlmostEqual(cuda_tensor_array_2.all(), + tensor_array.all()) + def test_int_lod_tensor(self): place = core.CPUPlace() scope = core.Scope() var_lod = scope.var("test_lod_tensor") lod_tensor = var_lod.get_tensor() - lod_tensor.set_dims([4, 4, 6]) - lod_tensor.alloc_int(place) + lod_tensor._set_dims([4, 4, 6]) + lod_tensor._alloc_int(place) array = numpy.array(lod_tensor) array[0, 0, 0] = 3 array[3, 3, 5] = 10 @@ -84,8 +107,8 @@ class TestTensor(unittest.TestCase): var_lod = scope.var("test_lod_tensor") lod_tensor = var_lod.get_tensor() - lod_tensor.set_dims([5, 2, 3, 4]) - lod_tensor.alloc_float(place) + lod_tensor._set_dims([5, 2, 3, 4]) + lod_tensor._alloc_float(place) tensor_array = numpy.array(lod_tensor) self.assertEqual((5, 2, 3, 4), tensor_array.shape) @@ -104,14 +127,13 @@ class TestTensor(unittest.TestCase): self.assertListEqual(lod_py, lod) def test_lod_tensor_init(self): - scope = core.Scope() place = core.CPUPlace() lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() - lod_tensor.set_dims([5, 2, 3, 4]) + lod_tensor._set_dims([5, 2, 3, 4]) lod_tensor.set_recursive_sequence_lengths(lod_py) - lod_tensor.alloc_float(place) + lod_tensor._alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 @@ -129,9 +151,9 @@ class TestTensor(unittest.TestCase): lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() - lod_tensor.set_dims([5, 2, 3, 4]) + lod_tensor._set_dims([5, 2, 3, 4]) 
lod_tensor.set_recursive_sequence_lengths(lod_py) - lod_tensor.alloc_float(place) + lod_tensor._alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 @@ -149,15 +171,15 @@ class TestTensor(unittest.TestCase): tensor = var.get_tensor() - tensor.set_dims([0, 1]) - tensor.alloc_float(place) + tensor._set_dims([0, 1]) + tensor._alloc_float(place) tensor_array = numpy.array(tensor) self.assertEqual((0, 1), tensor_array.shape) if core.is_compiled_with_cuda(): gpu_place = core.CUDAPlace(0) - tensor.alloc_float(gpu_place) + tensor._alloc_float(gpu_place) tensor_array = numpy.array(tensor) self.assertEqual((0, 1), tensor_array.shape) diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index cc2fcc5ec0a076679c7dd85a7e8f8da6a170172b..e54e170f7f1e03db4b63db72edb7395d18130f68 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -28,7 +30,7 @@ class TestTopkOp(OpTest): self.inputs = {'X': input} self.attrs = {'k': k} - for rowid in xrange(32): + for rowid in range(32): row = input[rowid] output[rowid] = np.sort(row)[-k:] indices[rowid] = row.argsort()[-k:] @@ -52,7 +54,7 @@ class TestTopkOp3d(OpTest): self.inputs = {'X': input_flat_2d} self.attrs = {'k': k} - for rowid in xrange(64): + for rowid in range(64): row = input_flat_2d[rowid] output[rowid] = np.sort(row)[-k:] indices[rowid] = row.argsort()[-k:] diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index ebd63fbd495354eafe298ad5cc3456a196538a6a..c30da2389d50d3b6bdf1f911aaed6ed71f274153 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -20,16 +22,19 @@ from op_test import OpTest class TestTransposeOp(OpTest): def setUp(self): self.initTestCase() - self.op_type = "transpose" + self.op_type = "transpose2" self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.attrs = {'axis': list(self.axis)} - self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("float32"), + 'Out': self.inputs['X'].transpose(self.axis) + } def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', sum_outputs=['Out']) def initTestCase(self): self.shape = (3, 4) diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py index e033e86114f17d37c01480fe8350648eb8aa27cb..7b8be24d9da8c15eeb52c0ba207ea780b03254f8 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 346a949b6e7c96b5535f5e65ddbada11e110a0a7..d6a5d68765c53d9d711add64c86575a0db6997e4 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py index 49ef335618ca7ca1e8249a61a97ca552dabdb9e8..b8c751b2e9b5a905d9de40fc5f78a02c6ca5e034 100644 --- a/python/paddle/fluid/tests/unittests/test_unique_name.py +++ b/python/paddle/fluid/tests/unittests/test_unique_name.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index a97d6dfdda9b79eed3be6302fb2b1c3810f189dc..b0c7c3c8662e217f4e88245f22f6b50e7a48c8b7 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np from op_test import OpTest @@ -22,12 +24,12 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] out = np.zeros((s0, s1, out_hsize, out_wsize)) - for nidx in xrange(s0): - for cidx in xrange(s1): - for h in xrange(s2): - for w in xrange(s3): + for nidx in range(s0): + for cidx in range(s1): + for h in range(s2): + for w in range(s3): index = indices[nidx, cidx, h, w] - hidx = (index - index % out_wsize) / out_wsize + hidx = (index - index % out_wsize) // out_wsize widx = index % out_wsize out[nidx, cidx, int(hidx), int(widx)] = \ input[nidx, cidx, h, w] @@ -41,28 +43,28 @@ class TestUnpoolOp(OpTest): self.init_test_case() pre_input = np.random.random(self.shape).astype("float32") nsize, csize, hsize, wsize = pre_input.shape - hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \ + hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) // \ self.strides[0] + 1 - wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \ + wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) // \ self.strides[1] + 1 input = np.zeros((nsize, csize, hsize_out, wsize_out)) indices = np.zeros((nsize, csize, hsize_out, wsize_out)) - for i in xrange(hsize_out): - for j in xrange(wsize_out): + for i in range(hsize_out): + for j in range(wsize_out): r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) r_end = np.min((i * self.strides[0] + self.ksize[0] - \ self.paddings[0], hsize)) c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) c_end = np.min((j * self.strides[1] + self.ksize[1] - \ self.paddings[1], wsize)) - for nidx in xrange(nsize): - for cidx in xrange(csize): + for nidx in range(nsize): + for cidx in range(csize): x_masked = pre_input[nidx, cidx, r_start:r_end, \ c_start:c_end] input[nidx, cidx, i, j] = x_masked.max() arg = x_masked.argmax() indices[nidx, cidx, i, j] = \ - (r_start + arg / self.ksize[1]) * wsize + \ + (r_start + arg // self.ksize[1]) * wsize + \ c_start + arg % self.ksize[1] output = self.unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py new file mode 100644 index 0000000000000000000000000000000000000000..14dd2bb06f9a18d0b15a4aee4e9e6bfdf8c41206 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -0,0 +1,83 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +from op_test import OpTest + + +# Correct: General. 
+class TestUnsqueezeOp(OpTest): + def setUp(self): + self.init_test_case() + self.op_type = "unsqueeze2" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } + + def test_check_output(self): + self.check_output(no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 5) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 5) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 5) + self.axes = (-1, ) + self.new_shape = (3, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 5) + self.axes = (0, -1) + self.new_shape = (1, 3, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 3, 2, 1, 1, 5) + + +# Correct: Reversed axes. +class TestUnsqueezeOp4(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (3, 1, 1, 2, 5, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7cbac8928ec40dc3e1c0e91e7779ec9ec978d884 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unstack_op.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from op_test import OpTest +import numpy as np +import unittest + + +class TestUnStackOpBase(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'float32' + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append('y{}'.format(i)) + return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.x = np.random.random(size=self.input_dim).astype(self.dtype) + + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + for i in range(self.input_dim[self.axis]): + tmp.append((y_names[i], np.reshape(outs[i], new_shape))) + + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad('X', self.get_y_names()) + + +class TestStackOp3(TestUnStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestUnStackOpBase): + def initParameters(self): + self.axis = -3 + + +class TestStackOp5(TestUnStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestUnStackOpBase): + def initParameters(self): + self.axis = 2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 49784e21c461bacadd404bf4a8640ebc4dcb26ca..4f3c26ca7bdf4d807952b413c8b0dc8b211c06f6 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_ import paddle.fluid.core as core @@ -29,7 +31,8 @@ class TestVariable(unittest.TestCase): self.assertEqual(DT.INT16, convert("int16")) self.assertEqual(DT.INT64, convert("int64")) self.assertEqual(DT.BOOL, convert("bool")) - self.assertRaises(ValueError, lambda: convert("int8")) + self.assertEqual(DT.INT8, convert("int8")) + self.assertEqual(DT.UINT8, convert("uint8")) def test_var(self): b = default_main_program().current_block() diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py new file mode 100644 index 0000000000000000000000000000000000000000..42a0e5c802c53ed0e6aad38fb9ab0f64122e87f5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_version.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import re + +import paddle.version as fluid_version + + +class VersionTest(unittest.TestCase): + def setUp(self): + self._major_regex = "[0-9]+" + self._minor_regex = "[0-9]+" + self._patch_regex = "[0-9]+(\\.(a|b|rc)\\.[0-9]+)?" + self._rc_regex = "[0-9]+" + self._version_regex = "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?" + self._commit_regex = "[0-9a-f]{5,49}" + + def test_check_output(self): + # check commit format + self.assertTrue(re.match(self._commit_regex, fluid_version.commit)) + self.assertTrue(isinstance(fluid_version.istaged, bool)) + + # check version format + if fluid_version.istaged: + self.assertEqual(fluid_version.major, 0) + self.assertEqual(fluid_version.minor, 0) + self.assertEqual(fluid_version.patch, "0") + self.assertEqual(fluid_version.rc, 0) + self.assertEqual(fluid_version.full_version, "0.0.0") + else: + self.assertTrue(re.match(self._major_regex, fluid_version.major)) + self.assertTrue(re.match(self._minor_regex, fluid_version.minor)) + self.assertTrue(re.match(self._patch_regex, fluid_version.patch)) + self.assertTrue(re.match(self._rc_regex, fluid_version.rc)) + self.assertTrue( + re.match(self._version_regex, fluid_version.full_version)) diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 9f1aaee472f918da7deb8816a0a4654dafe74a30..5e3aa13546d0c4fdcde4a3d6378d5a1748327814 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import sys import unittest import numpy as np @@ -132,7 +134,7 @@ class CTCForward(object): for k in range(end - start): j = k + start if j & 1 == 1: - label_idx = j / 2 + label_idx = j // 2 label_val = labels_a_sequence[label_idx, 0] fv = self.log_add(forward_vars[i - 1, j], forward_vars[i - 1, j - 1]) diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py index 436f9b9f86fb86270e47c8e30c5c0701787ca0f1..e990d8b2498f6a1b62f7a34d329e3ca72a962728 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py +++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy import collections diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index fe8808bc044684c96fb3382836be32dac1d241f3..b75373cf24a7344bf59b3c6fcb9c4c3969be6503 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
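Note on the test_warpctc_op change above: `/` is replaced with `//` because plain division returns a float on Python 3 and can no longer be used as an index. A tiny pure-Python illustration (no fluid dependency):

labels = list("abcde")

j = 7
label_idx = j // 2            # 3 on both Python 2 and Python 3
assert isinstance(label_idx, int)
assert labels[label_idx] == "d"

# With plain division, Python 3 gives 3.5 and indexing then fails with
#   TypeError: list indices must be integers or slices, not float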
+from __future__ import print_function + import unittest import paddle.fluid.layers as layers from paddle.fluid.executor import Executor @@ -66,7 +68,7 @@ class TestWhileOp(unittest.TestCase): exe = Executor(cpu) d = [] - for i in xrange(3): + for i in range(3): d.append(numpy.random.random(size=[10]).astype('float32')) outs = exe.run(feed={'d0': d[0], diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index a995ee10f29a714b674fae4b31070e6ba2ca9953..34fbb1b549cf5fc5f75bcc0715e5c83665f1d200 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -12,20 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator -def as_lodtensor(np_array, lod, place): - tensor = core.LoDTensor() - tensor.set(np_value, place) - if lod is not None: - tensor.set_recursive_sequence_lengths(lod) - return tensor - - def create_op(scope, op_type, inputs, outputs, attrs): kwargs = dict() @@ -69,14 +63,19 @@ def create_op(scope, op_type, inputs, outputs, attrs): def set_input(scope, op, inputs, place): + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def __set_input__(var_name, var): if isinstance(var, tuple) or isinstance(var, np.ndarray): tensor = scope.find_var(var_name).get_tensor() if isinstance(var, tuple): tensor.set_recursive_sequence_lengths(var[1]) var = var[0] - tensor.set_dims(var.shape) - tensor.set(var, place) + tensor._set_dims(var.shape) + tensor.set(np_value_to_fluid_value(var), place) elif isinstance(var, float): scope.find_var(var_name).set_float(var) elif isinstance(var, int): @@ -104,6 +103,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if name not in np_list: assert var_proto.intermediate, "{} not found".format(name) else: + # inferece the dtype from numpy value. np_value = np_list[name] if isinstance(np_value, tuple): dtype = np_value[0].dtype @@ -116,6 +116,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if is_input: shape = list(np_value.shape) lod_level = 0 + # NOTE(dzhwinter): type hacking + # numpy float16 is binded to paddle::platform::float16 + # in tensor_py.h via the help of uint16 datatype. Because + # the internal memory representation of float16 is + # actually uint16_t in paddle. So we use np.uint16 in numpy for + # raw memory, it can pass through the pybind. So in the testcase, + # we feed data use data.view(uint16), but the dtype is float16 in fact. 
+ # The data.view(uint16) means do not cast the data type, but process data as the uint16 + if dtype == np.uint16: + dtype = np.float16 return block.create_var( dtype=dtype, shape=shape, lod_level=lod_level, name=name) @@ -142,10 +152,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): def append_loss_ops(block, output_names): - mean_inputs = map(block.var, output_names) - # for item in mean_inputs: - # print(item) - # print("Item", item.dtype) + mean_inputs = list(map(block.var, output_names)) if len(mean_inputs) == 1: loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index c62792face3c353db1f2e3c77eaf4bd32fbded69..f0e74aff6bdfa7d9f0a7f10e64cac4de88009f0a 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from functools import partial import numpy as np @@ -22,7 +24,7 @@ pos_enc_param_names = ( "src_pos_enc_table", "trg_pos_enc_table", ) -batch_size = 64 +batch_size = 2 def position_encoding_init(n_position, d_pos_vec): @@ -118,8 +120,9 @@ def multi_head_attention(queries, # FIXME(guosheng): Decouple the program desc with batch_size. return layers.reshape( x=trans_x, - shape=map(int, - [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])) + shape=list( + map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3] + ]))) def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): """ @@ -403,7 +406,7 @@ def transformer( trg_pad_idx, pos_pad_idx, ): file_obj = fluid.layers.open_recordio_file( - filename='./wmt16.recordio', + filename='/tmp/wmt16.recordio', shapes=[ [batch_size * max_length, 1], [batch_size * max_length, 1], diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b6e0241265b18377874efb0d223441994b4650d0..30cdfe4ad2c9892184862b70ff49417ce5a08516 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -12,19 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import contextlib import os - -import core - -import data_feeder -import executor -import framework -import io +import errno +import shutil +import six +import time + +from . import core +from . import data_feeder +from . import executor +from . import framework +from . import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module -import optimizer as opt_module -import parallel_executor -from transpiler import distribute_transpiler +from . import optimizer as opt_module +from . import parallel_executor +from .transpiler import distribute_transpiler __all__ = [ 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', @@ -70,7 +75,7 @@ class BeginStepEvent(object): self.step = step_id self.fetch_metrics = True """ - If fetch_metrics is true, the metrics will be fetched at the + If fetch_metrics is true, the metrics will be fetched at the EndStepEvent. Default is True. """ @@ -94,7 +99,7 @@ class EndStepEvent(object): class CheckpointConfig(object): """ - Parameter object for :code:`fluid.io.save_checkpoint` and + Parameter object for :code:`save_checkpoint` and :code:`fluid.Trainer`. 
Used to configuration how to save checkpoint. Args: @@ -237,7 +242,7 @@ class Trainer(object): self.checkpoint_cfg = checkpoint_config if self.checkpoint_cfg: assert isinstance(self.checkpoint_cfg, CheckpointConfig) - serial = io.get_latest_checkpoint_serial( + serial = _get_latest_checkpoint_serial( self.checkpoint_cfg.checkpoint_dir) self.checkpoint_cfg.load_serial = serial if serial >= 0 else None @@ -276,32 +281,16 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: - with self._prog_and_scope_guard(): - exe = executor.Executor(place) - io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir, - self.checkpoint_cfg.load_serial, - self.startup_program) - - if not self.checkpoint_cfg.pserver_id: - epoch_id, step_id = io.load_trainer_args( - self.checkpoint_cfg.checkpoint_dir, - self.checkpoint_cfg.load_serial, self.trainer_id, - self._get_checkpoint_load_args()) - self.checkpoint_cfg.epoch_id = int(epoch_id) - self.checkpoint_cfg.step_id = int(step_id) - else: - if self.checkpoint_cfg.lookup_table_name: - io.load_lookup_table_vars( - exe, self.checkpoint_cfg.checkpoint_dir, - self.startup_program, - self.checkpoint_cfg.pserver_id, - self.checkpoint_cfg.lookup_table_name) + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None: + self._load_checkpoint() if param_path and os.path.isdir(param_path): - # load params from param_path into scope - io.load_persist_vars_without_grad( - exe, dirname=param_path, program=self.startup_program) + with self._prog_and_scope_guard(): + # load params from param_path into scope + io.load_persistables( + executor=exe, + dirname=param_path, + main_program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -442,6 +431,28 @@ class Trainer(object): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) + def save_inference_model(self, param_path, feeded_var_names, + target_var_indexes): + """ + Save model for cpp inference into :code:`param_path`. + + Args: + param_path(str): The path to save parameters. + feeded_var_names(list(str)): The name of the vars that you + need to feed in before run program. + target_var_indexes(list(int)): the index of target var that + you need to return in trainer.train_func. 
+ Returns: + None + """ + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + target_vars = [ + self.train_func_outputs[index] for index in target_var_indexes + ] + io.save_inference_model(param_path, feeded_var_names, target_vars, + exe) + @contextlib.contextmanager def _prog_and_scope_guard(self): with framework.program_guard( @@ -549,7 +560,7 @@ class Trainer(object): def _clean_checkpoint(self): assert self.checkpoint_cfg - io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) + clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) def _get_checkpoint_load_args(self): """ @@ -572,7 +583,7 @@ class Trainer(object): if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ and step_id % self.checkpoint_cfg.step_interval == 0: exe = executor.Executor(self.place) - io.save_checkpoint( + save_checkpoint( executor=exe, checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, trainer_id=self.trainer_id, @@ -580,6 +591,41 @@ class Trainer(object): main_program=self.train_program, max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) + def _load_checkpoint(self): + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program) + + if not self.checkpoint_cfg.pserver_id: + load_trainer_args = self._get_checkpoint_load_args() + trainer_args = load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program, + role_id=self.trainer_id, + is_trainer=True, + load_trainer_args=load_trainer_args) + + if len(trainer_args) != 2: + raise ValueError( + "the return trainer_args length do not equal _get_checkpoint_load_args" + ) + self.checkpoint_cfg.epoch_id = int(trainer_args[0]) + self.checkpoint_cfg.step_id = int(trainer_args[1]) + else: + if self.checkpoint_cfg.lookup_table_name: + load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program, + role_id=self.checkpoint_cfg.pserver_id, + is_trainer=False, + load_trainer_args=None, + load_lookup_table=self.checkpoint_cfg.lookup_table_name) + def build_feed_var_list(program, feed_order): if not isinstance(program, framework.Program): @@ -593,12 +639,620 @@ def build_feed_var_list(program, feed_order): if not isinstance(feed_order, dict): raise TypeError( "The 'feed_order' should be either None, list or dict.") - if not sorted(feed_order.values()) == range(len(feed_order)): + if not sorted(feed_order.values()) == list(range(len(feed_order))): raise ValueError( "The values of 'feed_order' should be a permutation of [0, len(feed_order))" ) - sorted_pair_list = sorted(feed_order.items(), key=lambda item: item[1]) + sorted_pair_list = sorted( + six.iteritems(feed_order), key=lambda item: item[1]) feed_var_list = [ program.global_block().var(pair[0]) for pair in sorted_pair_list ] return feed_var_list + + +# move Checkpoint APIs from io.py to trainer.py, make all of them are private. 
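Note on the build_feed_var_list changes above: they are Python 3 portability fixes. range() is lazy on Python 3, so the permutation check must materialize it with list(), and dict items are iterated through six so the same code runs on 2 and 3. A minimal standalone check (the feed_order contents below are made up for illustration):

import six

feed_order = {"img": 0, "label": 1}    # hypothetical feed_order

# Python 3: sorted([...]) == range(n) is always False because range() is
# not a list; wrapping it in list() restores the intended permutation check.
assert sorted(feed_order.values()) == list(range(len(feed_order)))

# six.iteritems works on both Python 2 and 3; sorting by the order index
# recovers the feed sequence.
sorted_pair_list = sorted(six.iteritems(feed_order), key=lambda item: item[1])
assert [name for name, _ in sorted_pair_list] == ["img", "label"]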
+SUCCESS_MARK_FILENAME = "_SUCCESS" +CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +LOOKUP_TABLE_DIR = "__lookup_table__" +TRAINER_PREFIX = "trainer" +CHECKPOINT_SEPARATOR = "_" + + +def save_checkpoint(executor, + checkpoint_dir, + trainer_id, + main_program, + trainer_args=None, + max_num_checkpoints=3, + lookup_table=None, + pserver_endpoints=None): + """ + This function filters out all checkpoint variables from the give + main_program and then saves these variables to the `checkpoint_dir` + directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there might be a lot of checkpoints in the + `checkpoint_dir`. To avoid them taking too much disk space, the + `max_num_checkpoints` are introduced to limit the total number of + checkpoints. If the number of existing checkpints is greater than + the `max_num_checkpoints`, oldest ones will be scroll deleted. + + A variable is a checkpoint variable and will be saved if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for save checkpoint. + checkpoint_dir(str): The folder where to save checkpoints. + trainer_id(int): currect trainer id, if id is equal to 0, the trainer + is chief. + trainer_args(dict|None): Current training arguments. Such as 'epoch_id' + and 'step_id'. + Defaut: None + main_program(Program): The program whose checkpoint variables will + be saved. + max_num_checkpoints(int): The max number of total number of existing + checkpoints. + Default: 3 + lookup_table(string|None): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + pserver_endpoints(list|None): the parameter server ip:port list. + when use distribute lookup table, we can get pserver_endpoints by + distribute arguments. + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + AssertionError: If `trainer_args` is not a dict. + + Examples: + .. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3, + lookup_table=table_name, + pserver_endpoints = ps_endpoints) + """ + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + if main_program is None: + raise ValueError('main_program should not be None.') + + if trainer_args: + assert isinstance(trainer_args, dict) + + is_chief = trainer_id == 0 + + _make_chekcpoint_dirs(checkpoint_dir) + serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) + + _save_trainer_args(cur_dir, trainer_id, trainer_args) + + if is_chief: + _save_persist_vars_without_grad(executor, cur_dir, main_program) + + if is_chief and lookup_table and pserver_endpoints: + _save_pserver_vars_by_notify(executor, cur_dir, lookup_table, + pserver_endpoints) + + _scroll_delete(checkpoint_dir, max_num_checkpoints) + + +def load_checkpoint(executor, + checkpoint_dir, + main_program, + role_id=0, + is_trainer=True, + load_trainer_args=None, + load_lookup_table=None): + """ + This function filters out all checkpoint variables from the give + main_program and then try to load these variables from the + `checkpoint_dir` directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there are more than one checkpoint in the + `checkpoint_dir` (each checkpoint has its own sub folder), use + `serial` to specify which serial of checkpoint you would like to + load. + + A variable is a checkpoint variable and will be loaded if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading checkpoint. + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + main_program(Program): The program whose checkpoint variables will + be loaded. + role_id(int): the trainer id or the parameter server id. + is_trainer(bool): trainer is True and parameter server is False. + load_trainer_args(list|None): list about load trainer args. + load_lookup_table(str|None): the lookup table name + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + ValueError: If `main_program` is None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + + # In this example, `load_checkpoint` function + # will first filters out all checkpoint variables in the default + # main program, and then try to load these variables form the + # folder "./checkpoints/checkpoint_9/__model__". 
+ """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + serial = _get_latest_checkpoint_serial(checkpoint_dir) + + # there are nothing need to be loaded + if serial is None or serial < 0: + return + + if main_program is None: + raise ValueError('main_program should not be None.') + + if is_trainer and load_trainer_args is None: + cur_dir = _get_serial_dir(checkpoint_dir, serial) + _load_persist_vars_without_grad(executor, cur_dir, main_program, True) + return + + if is_trainer and load_trainer_args: + return _load_trainer_args(checkpoint_dir, serial, role_id, + load_trainer_args) + + if not is_trainer and load_lookup_table: + _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id, + load_lookup_table) + + +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. + + : param checkpoint_dir + : param delete_dir + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + +def _load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): + """ + This function filters out all checkpoint variables from the give + program and then trys to load these variables from the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be loaded. + has_model_dir(bool): if True, the function loads variables + from a sub directory named '__model__'. + Default: False + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _load_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog, has_model_dir=True) + + # In this example, `_load_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then trys to load these variables form the + # folder "./my_paddle_model/__model__". + """ + + if has_model_dir: + dirname = _get_model_dir(dirname) + + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): + """ + The parameter server will load lookup table's local file in + selectedrows variable. + + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + dirname = "./checkpoints/checkpoint_9/" + prog = fluid.default_main_program() + pserver_id = 1 + table_name = "share_w" + _load_lookup_table_vars(executor=exe, + dirname=dirname, program=prog, pserver_id=pserver_id, + table_name=table_name) + """ + + for var in program.list_vars(): + if var.name == table_name: + lookup_table_var = var + break + + assert lookup_table_var is not None + + lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) + + load_prog = framework.Program() + load_block = load_prog.global_block() + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [lookup_table_var]}, + attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + + executor.run(load_prog) + + +def _save_persist_vars_without_grad(executor, dirname, program): + """ + This function filters out all checkpoint variables from the give + program and then save these variables to a sub-folder '__model__' of + the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be saved. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _save_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog) + + # In this example, `_save_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then saves these variables to the folder + # "./my_paddle_model/__model__". + """ + cur_dir = _get_model_dir(dirname) + io.save_vars( + executor, + dirname=cur_dir, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + + +def _save_pserver_vars_by_notify(executor, dirname, lookup_table, + ps_endpoint_list): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save checkpoints. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + _save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + ps_endpoint_list=ps_endpoints) + """ + cur_dir = _get_lookuptable_dir(dirname) + + checkpoint_notify_program = framework.Program() + checkpoint_notify_block = checkpoint_notify_program.global_block() + + attrs = {} + attrs['epmap'] = ps_endpoint_list + attrs['dir'] = cur_dir + attrs['lookup_table'] = lookup_table + + checkpoint_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(checkpoint_notify_program) + + +def _save_trainer_args(dirname, trainer_id, trainer_args): + assert isinstance(trainer_args, dict) + + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in six.iteritems(trainer_args): + args_file = os.path.join(cur_dir, name) + with open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + """ + trainer will load some args from it's independent directory, + such as epoch_id and step_id. + + Args: + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + trainer_id(int): current trainer id. + trainer_args(list): list about load trainer args + Return: + None + + Examples: + .. code-block:: python + + param_path = "./checkpoint/" + serial = 7 + trainer_id = 2 + trainer_args = ["epoch_id", "step_id"] + + _load_trainer_args(checkpoint_dir=param_path, serial=serial, + trainer_id=trainer_id, trainer_args=trainer_args) + """ + assert isinstance(trainer_args, list) + + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values + + +def _is_checkpoint_var(var): + """ + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + + : param var(Variable) + """ + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False + + # .block is named for distribute train variables, checkpoint will not save it. + if ".block" in var.name: + return False + + return var.persistable + + +def _make_chekcpoint_dirs(dirs): + """ + _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
+ """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + serial_dir = os.path.join(dirname, serial_folder) + _make_chekcpoint_dirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir + + +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + _make_chekcpoint_dirs(trainer_dir) + return trainer_dir + + +def _scroll_delete(dirname, max_num_checkpoints=3): + dirs = os.listdir(dirname) + serial_map = {} + for serial in dirs: + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial + + if len(list(serial_map.keys())) <= max_num_checkpoints: + return + + serials = list(serial_map.keys()) + serials.sort(reverse=True) + serials = serials[max_num_checkpoints:] + for serial in serials: + cur_dir = _get_serial_dir(dirname, serial) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err + + +def _write_success(dirname): + """ + write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. + + : param dirname + """ + success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) + with open(success_file, 'a') as f: + now = time.ctime() + f.write(now) + + +def _get_latest_checkpoint_serial(checkpoint_dir): + """ + get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory + + : param checkpoint_dir + """ + if not checkpoint_dir: + return -1 + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + + serial = _get_dir_serial(cur_dir) + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): + return -1 + + success_path = os.path.join( + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) + if os.path.isfile(success_path): + return serial + + if not os.path.isdir(checkpoint_dir): + return -1 + + current_dir = -1 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return current_dir diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index cf18090f71f34be5105498f5846dbcdf15ab2e3f..8429e2fd7c5141f064c66d8f406889bca1510fe2 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from distribute_transpiler import DistributeTranspiler -from inference_transpiler import InferenceTranspiler -from memory_optimization_transpiler import memory_optimize, release_memory -from ps_dispatcher import HashName, RoundRobin +from __future__ import print_function + +from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig +from .inference_transpiler import InferenceTranspiler +from .memory_optimization_transpiler import memory_optimize, release_memory +from .ps_dispatcher import HashName, RoundRobin __all__ = [ "DistributeTranspiler", "InferenceTranspiler", "memory_optimize", - "release_memory", "HashName", "RoundRobin" + "release_memory", "HashName", "RoundRobin", "DistributeTranspilerConfig" ] diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index dc597c33849dc06cc975b245099672f64c3539d3..f33c05ed2f48c2498b98fc486d6ff7471088d77e 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -12,5 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from program_utils import * -from ufind import * +from __future__ import print_function + +from .program_utils import * +from .ufind import * +from .checkport import * diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py new file mode 100644 index 0000000000000000000000000000000000000000..7bad4b427a2d53bd14c7a1f870ce74a883158d04 --- /dev/null +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import socket +from contextlib import closing + + +def wait_server_ready(endpoints): + """ + Wait until parameter servers are ready, use connext_ex to detect + port readiness. + + Args: + endpoints (list): endpoints string list, like: + ["127.0.0.1:8080", "127.0.0.1:8081"] + + Examples: + .. 
code-block:: python + + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + """ + while True: + all_ok = True + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + if not all_ok: + sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.flush() + time.sleep(3) + else: + break diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index f10b496306a002ee131d01798a0698b807d379ca..a83aa0f11eed9bfc1674d8d75dcfacc297f056b0 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -12,15 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + +import six + +from paddle.fluid import core +import paddle + def delete_ops(block, ops): try: start = list(block.ops).index(ops[0]) end = list(block.ops).index(ops[-1]) - [block.remove_op(start) for _ in xrange(end - start + 1)] - except Exception, e: + [block._remove_op(start) for _ in six.moves.range(end - start + 1)] + except Exception as e: raise e - block.program.sync_with_cpp() + block.program._sync_with_cpp() def find_op_by_input_arg(block, arg_name): @@ -35,3 +42,142 @@ def find_op_by_output_arg(block, arg_name): if arg_name in op.output_arg_names: return index return -1 + + +def get_indent_space(indent, space_num=4): + ret = "" + for i in range(0, indent * space_num): + ret += " " + + return ret + + +def variable_to_code(var): + """ + Get readable codes of fluid variable. + + Args: + var: A fluid operator. + + Returns: + string: The formatted string. + """ + if var.type == core.VarDesc.VarType.SELECTED_ROWS or var.type == core.VarDesc.VarType.LOD_TENSOR: + var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})".\ + format(i="{", e="}", name=var.name, type=var.type, shape=var.shape, dtype=var.dtype) + else: + var_str = "{name} : fluid.{type})".\ + format(i="{", e="}", name=var.name, type=var.type) + + if type(var) == paddle.fluid.framework.Parameter: + if var.trainable: + var_str = "trainable parameter " + var_str + else: + var_str = "parameter " + var_str + else: + var_str = "var " + var_str + + if var.persistable: + var_str = "persist " + var_str + + return var_str + + +def op_to_code(op): + """ + Get readable codes of fluid operator. + + Args: + op: A fluid operator. + + Returns: + string: The foramtted string. 
+ """ + + outputs_str = "{" + for i in range(0, len(op.output_names)): + outputs_str += "{name}=".format(name=op.output_names[i]) + o = op.output(op.output_names[i]) + outputs_str += "{value}".format(value=o) + if i != len(op.output_names) - 1: + outputs_str += ", " + outputs_str += "}" + + inputs_str = "{" + for i in range(0, len(op.input_names)): + inputs_str += "{name}=".format(name=op.input_names[i]) + o = op.input(op.input_names[i]) + inputs_str += "{value}".format(value=o) + + if i != len(op.input_names) - 1: + inputs_str += ", " + inputs_str += "}" + + attrs_str = "" + for i in range(0, len(op.attr_names)): + name = op.attr_names[i] + + attr_type = op.desc.attr_type(name) + if attr_type == core.AttrType.BLOCK: + a = "{name} = block[{value}]".format( + name=name, type=attr_type, value=op.block_attr_id(name)) + attrs_str += a + continue + + if attr_type == core.AttrType.BLOCKS: + a = "{name} = blocks{value}".format( + name=name, type=attr_type, value=op.blocks_attr_ids(name)) + attrs_str += a + continue + + a = "{name} = {value}".format( + name=name, type=attr_type, value=op.desc.attr(name)) + attrs_str += a + if i != len(op.attr_names) - 1: + attrs_str += ", " + + if outputs_str != "{}": + op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\ + format(outputs = outputs_str, op_type=op.type, inputs=inputs_str, attrs=attrs_str) + else: + op_str = "{op_type}(inputs={inputs}, {attrs})".\ + format(op_type=op.type, inputs=inputs_str, attrs=attrs_str) + return op_str + + +def block_to_code(block, block_idx): + indent = 0 + + print("{0}{1} // block {2}".format( + get_indent_space(indent), '{', block_idx)) + + indent += 1 + # sort all vars + all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0]) + for var in all_vars: + print("{}{}".format(get_indent_space(indent), variable_to_code(var[1]))) + + if len(all_vars) > 0: + print("") + + for op in block.ops: + print("{}{}".format(get_indent_space(indent), op_to_code(op))) + indent -= 1 + + print("{0}{1}".format(get_indent_space(indent), '}')) + + +def program_to_code(prog): + """ + Print readable codes of fluid program. + + Args: + prog : A fluid program. + + An example result like bellow: + https://github.com/PaddlePaddle/Paddle/pull/12673 + """ + block_idx = 0 + for block in prog.blocks: + block_to_code(block, block_idx) + block_idx += 1 diff --git a/python/paddle/fluid/transpiler/details/ufind.py b/python/paddle/fluid/transpiler/details/ufind.py index 0e30d0e3f9c5712c494daf17b2b4bcec86f69c23..aa63af7dcf7ac85031fb00ca4c39fb36d7e588b8 100644 --- a/python/paddle/fluid/transpiler/details/ufind.py +++ b/python/paddle/fluid/transpiler/details/ufind.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + class UnionFind(object): """ Union-find data structure. diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 53d6ca86a008f798af2854a154cce8b7242d2f35..53c9cbe23dd82af866658fe46d1d631b0a3b26f3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function """ Steps to transpile trainer: 1. 
split variable to multiple blocks, aligned by product(dim[1:]) (width). @@ -28,17 +30,19 @@ Steps to transpile pserver: 5. add listen_and_serv op """ -from __future__ import print_function - import math +import sys import numpy as np +import collections +import six -from ps_dispatcher import RoundRobin, HashName, PSDispatcher +from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ default_startup_program, Block, \ - Variable, Parameter, grad_var_name -from details import * + Parameter, grad_var_name +from .details import * +from functools import reduce LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" @@ -63,7 +67,7 @@ def same_or_split_var(p_name, var_name): return p_name == var_name or p_name.startswith(var_name + ".block") -def slice_variable(var_list, slice_count, min_block_size=8192): +def slice_variable(var_list, slice_count, min_block_size): """ We may need to split dense tensor to one or more blocks and put them equally onto parameter server. One block is a sub-tensor @@ -101,7 +105,7 @@ def slice_variable(var_list, slice_count, min_block_size=8192): block_size += dim1 - remains # update split_count after aligning split_count = int(math.ceil(var_numel / float(block_size))) - for block_id in xrange(split_count): + for block_id in range(split_count): curr_block_size = min(block_size, var_numel - ( (block_id) * block_size)) block = VarBlock(var.name, block_id, curr_block_size) @@ -109,6 +113,22 @@ def slice_variable(var_list, slice_count, min_block_size=8192): return blocks +class DistributeTranspilerConfig(object): + """ + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + min_block_size (int): Minimum splitted element number in block. + According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + We can use bandwidth effiently when data size is larger than 2MB.If you + want to change it, please be sure you see the slice_variable function. + """ + + slice_var_up = True + split_method = None + min_block_size = 8192 + + class DistributeTranspiler(object): """ **DistributeTranspiler** @@ -145,14 +165,25 @@ class DistributeTranspiler(object): trainer_program = t.get_trainer_program() """ + def __init__(self, config=None): + if config is not None: + self.config = config + else: + self.config = DistributeTranspilerConfig() + + if self.config.split_method is None: + self.config.split_method = RoundRobin + + assert (self.config.min_block_size >= 8192) + assert (self.config.split_method.__bases__[0] == PSDispatcher) + def transpile(self, trainer_id, program=None, pservers="127.0.0.1:6174", trainers=1, - slice_var_up=True, - split_method=RoundRobin, - sync_mode=True): + sync_mode=True, + startup_program=None): """ Run the transpiler. @@ -164,15 +195,18 @@ class DistributeTranspiler(object): pservers (str): comma separated ip:port string for the pserver list. trainers (int): number of trainers in the distributed job. - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. sync_mode (bool): Do sync training or not, default is True. + startup_program (Program|None): startup_program to transpile, + default is fluid.default_main_program(). 
""" - assert (split_method.__bases__[0] == PSDispatcher) if program is None: program = default_main_program() + if startup_program is None: + startup_program = default_startup_program() self.origin_program = program + self.startup_program = startup_program + self.origin_startup_program = self.startup_program.clone() + self.trainer_num = trainers self.sync_mode = sync_mode self.trainer_id = trainer_id @@ -180,13 +214,25 @@ class DistributeTranspiler(object): self.pserver_endpoints = pserver_endpoints self.optimize_ops, self.params_grads = self._get_optimize_pass() - ps_dispatcher = split_method(self.pserver_endpoints) + ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.has_distributed_lookup_table = self._has_distributed_lookup_table() + self.param_name_to_grad_name = dict() + self.grad_name_to_param_name = dict() + for param_var, grad_var in self.params_grads: + self.param_name_to_grad_name[param_var.name] = grad_var.name + self.grad_name_to_param_name[grad_var.name] = param_var.name + + # add distributed attrs to program + self.origin_program._is_distributed = True + self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._is_chief = self.trainer_id == 0 + self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None # split and create vars, then put splited vars in dicts for later use. - self._init_splited_vars(slice_var_up) + # step 1: split and create vars, then put splited vars in dicts for later use. + self._init_splited_vars() - # step 3.1: insert send op to send gradient vars to parameter servers + # step 2: insert send op to send gradient vars to parameter servers ps_dispatcher.reset() send_vars = [] @@ -195,54 +241,73 @@ class DistributeTranspiler(object): # fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1 # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 # shuffle the map will avoid the uneven distribution above - grad_var_mapping_items = self.grad_var_mapping.items() - if not slice_var_up: + grad_var_mapping_items = list(six.iteritems(self.grad_var_mapping)) + + if not self.config.slice_var_up: + np.random.seed(self.origin_program.random_seed) np.random.shuffle(grad_var_mapping_items) - for orig_varname, splited_vars in grad_var_mapping_items: + grad_name_to_send_dummy_out = dict() + for grad_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) - if not slice_var_up: + if not self.config.slice_var_up: assert (len(splited_vars) == 1) + splited_grad_varname = grad_varname if len(splited_vars) == 1: - orig_varname = splited_vars[0].name + splited_grad_varname = splited_vars[0].name index = find_op_by_output_arg(program.global_block(), - orig_varname) + splited_grad_varname) elif len(splited_vars) > 1: - orig_var = program.global_block().vars[orig_varname] + orig_var = program.global_block().vars[splited_grad_varname] index = find_op_by_output_arg(program.global_block(), - orig_varname) + splited_grad_varname) self._insert_split_op(program, orig_var, index, splited_vars) index += 1 else: AssertionError("Can not insert the send op by original " - "variable name :", orig_varname) + "variable name :", splited_grad_varname) - program.global_block().insert_op( + dummy_output = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) + grad_name_to_send_dummy_out[grad_varname] = dummy_output + + # get send op_role_var, if not splited, the grad should have .trainer suffix + # if splited, grad should be the original grad var name 
(split_by_ref and send + # will be on the same place). ParallelExecutor + # will use op_role_var to get expected device place to run this op. + program.global_block()._insert_op( index=index + 1, type="send", inputs={"X": splited_vars}, - outputs={}, + outputs={"Out": dummy_output}, attrs={ "epmap": eplist, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + self.grad_name_to_param_name[grad_varname], + splited_grad_varname + ], + "sync_mode": not self.sync_mode, }) for _, var in enumerate(splited_vars): send_vars.append(var) if self.sync_mode: + send_barrier_out = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) + input_deps = grad_name_to_send_dummy_out.values() program.global_block().append_op( type="send_barrier", - inputs={}, - outputs={}, + inputs={"X": list(input_deps)}, + outputs={"Out": send_barrier_out}, attrs={ "endpoints": pserver_endpoints, - "sync_mode": self.sync_mode, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - # step 3.2: insert recv op to receive parameters from parameter server + # step 3: insert recv op to receive parameters from parameter server recv_vars = [] for _, var in enumerate(send_vars): recv_vars.append(self.grad_param_mapping[var]) @@ -254,46 +319,69 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) # step4: Concat the parameters splits together after recv. - for varname, splited_var in self.param_var_mapping.iteritems(): + all_recv_outputs = [] + for param_varname, splited_var in six.iteritems(self.param_var_mapping): eps = [] for var in splited_var: index = [v.name for v in recv_vars].index(var.name) eps.append(eplist[index]) + if self.sync_mode: + recv_dep_in = send_barrier_out + else: + # connect deps to send op in async mode + recv_dep_in = grad_name_to_send_dummy_out[ + self.param_name_to_grad_name[param_varname]] + all_recv_outputs.extend(splited_var) + # get recv op_role_var, if not splited, the grad should have .trainer suffix + # if splited, grad should be the original grad var name. ParallelExecutor + # will use op_role_var to get expected device place to run this op. 
+ orig_grad_name = self.param_name_to_grad_name[param_varname] + recv_op_role_var_name = orig_grad_name + splited_trainer_grad = self.grad_var_mapping[orig_grad_name] + if len(splited_trainer_grad) == 1: + recv_op_role_var_name = splited_trainer_grad[0].name program.global_block().append_op( type="recv", - inputs={}, + inputs={"X": [recv_dep_in]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: + [param_varname, recv_op_role_var_name], + "sync_mode": not self.sync_mode }) - program.global_block().append_op( - type="fetch_barrier", - inputs={}, - outputs={}, - attrs={ - "endpoints": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE - }) + if self.sync_mode: + # form a WAW dependency + program.global_block().append_op( + type="fetch_barrier", + inputs={}, + outputs={"Out": all_recv_outputs}, + attrs={ + "endpoints": pserver_endpoints, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) - for varname, splited_var in self.param_var_mapping.iteritems(): + for param_varname, splited_var in six.iteritems(self.param_var_mapping): if len(splited_var) <= 1: continue - orig_param = program.global_block().vars[varname] + orig_param = program.global_block().vars[param_varname] program.global_block().append_op( type="concat", inputs={"X": splited_var}, outputs={"Out": [orig_param]}, attrs={"axis": 0}) + self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) + if self.has_distributed_lookup_table: self._replace_lookup_table_op_with_prefetch(program, pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) - def get_trainer_program(self): + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -304,8 +392,91 @@ class DistributeTranspiler(object): # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? delete_ops(self.origin_program.global_block(), self.optimize_ops) self.origin_program.__str__() + + if wait_port: + wait_server_ready(self.pserver_endpoints) + return self.origin_program + def _get_trainer_startup_program(self, recv_vars, eplist): + """ + Get transpiled trainer side startup program. + + Args: + recv_vars (list): Variable list to recv for current trainer_id + eplist (list): A list of strings indicating + + Returns: + Program: trainer side startup program. + """ + startup_program = self.startup_program + + # FIXME(gongwb): delete not need ops. + # note that: some parameter is not trainable and those ops can't be deleted. 
+ + for varname, splited_var in six.iteritems(self.param_var_mapping): + # Get the eplist of recv vars + eps = [] + for var in splited_var: + index = [v.name for v in recv_vars].index(var.name) + eps.append(eplist[index]) + + for var in splited_var: + if startup_program.global_block().has_var(var.name): + continue + + startup_program.global_block().create_var( + name=var.name, + persistable=False, + type=var.type, + dtype=var.dtype, + shape=var.shape, + lod_level=var.lod_level) + + op = startup_program.global_block().append_op( + type="recv", + inputs={"X": []}, + outputs={"Out": splited_var}, + attrs={ + "epmap": eps, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) + + fetch_barrier_out = startup_program.global_block().create_var( + name=framework.generate_control_dev_var_name()) + startup_program.global_block().append_op( + type="fetch_barrier", + inputs={}, + outputs={"Out": fetch_barrier_out}, + attrs={ + "endpoints": self.pserver_endpoints, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) + + for varname, splited_var in six.iteritems(self.param_var_mapping): + #add concat ops to merge splited parameters received from parameter servers. + if len(splited_var) <= 1: + continue + # NOTE: if enable memory optimization, origin vars maybe removed. + if varname in startup_program.global_block().vars: + orig_param = startup_program.global_block().vars[varname] + else: + origin_param_var = self.origin_program.global_block().vars[ + varname] + orig_param = startup_program.global_block().create_var( + name=varname, + persistable=origin_param_var.persistable, + type=origin_param_var.type, + dtype=origin_param_var.dtype, + shape=origin_param_var.shape) + startup_program.global_block().append_op( + type="concat", + inputs={"X": splited_var}, + outputs={"Out": [orig_param]}, + attrs={"axis": 0}) + + return startup_program + def get_pserver_program(self, endpoint): """ Get parameter server side program. @@ -320,9 +491,12 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. - + sys.stderr.write("get_pserver_program() is deprecated, call\ + get_pserver_programs() to get pserver main and startup\ + in a single call.") # step1 pserver_program = Program() + pserver_program.random_seed = self.origin_program.random_seed # step2: Create vars to receive vars at parameter servers. recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -347,7 +521,7 @@ class DistributeTranspiler(object): dtype=v.dtype, shape=v.shape) if self.sync_mode and self.trainer_num > 1: - for trainer_id in xrange(self.trainer_num): + for trainer_id in range(self.trainer_num): var = pserver_program.global_block().create_var( name="%s.trainer_%d" % (orig_var_name, trainer_id), persistable=False, @@ -377,11 +551,6 @@ class DistributeTranspiler(object): # append it into the sub program. global_ops = [] - # HACK: optimization global ops only used to scale beta1 and beta2 - # replace it with dependency engine. 
- for op in self.optimize_ops: - if self._is_adam_connected_op(op): - global_ops.append(op) def __append_optimize_op__(op, block, grad_to_block_id, merged_var, lr_ops): @@ -410,7 +579,7 @@ class DistributeTranspiler(object): # clone vars for var in origin_block.vars: - new_sub_block.clone_variable(var) + new_sub_block._clone_variable(var) # clone ops for origin_op in origin_block.ops: @@ -442,6 +611,8 @@ class DistributeTranspiler(object): per_opt_block = pserver_program.create_block(pre_block_idx) optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay + # cases may like: + # L2Decay op -> clip op -> optimize for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping grad_varname_for_block = __op_have_grad_input__(op) @@ -449,6 +620,7 @@ class DistributeTranspiler(object): merged_var = self._append_pserver_grad_merge_ops( per_opt_block, grad_varname_for_block, endpoint, grad_to_block_id, self.origin_program) + break # append optimize op once then append other ops. for _, op in enumerate(self.optimize_ops): # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: @@ -472,11 +644,14 @@ class DistributeTranspiler(object): pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) + optimize_blocks.append(table_opt_block) prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) + pserver_program._distributed_lookup_table = self.table_name + # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place if self.has_distributed_lookup_table: @@ -503,25 +678,64 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) - pserver_program.sync_with_cpp() + # add distributed attrs + pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs( + endpoint) + + pserver_program._sync_with_cpp() + # save pserver program to generate pserver side startup relatively. + self.pserver_program = pserver_program return pserver_program - def get_startup_program(self, endpoint, pserver_program): + def get_pserver_programs(self, endpoint): """ + Get pserver side main program and startup program for distributed training. + + Args: + endpoint (str): current pserver endpoint. + + Returns: + tuple: (main_program, startup_program), of type "Program" + """ + pserver_prog = self.get_pserver_program(endpoint) + pserver_startup = self.get_startup_program(endpoint) + return pserver_prog, pserver_startup + + def get_startup_program(self, + endpoint, + pserver_program=None, + startup_program=None): + """ + **Deprecated** + Get startup program for current parameter server. Modify operator input variables if there are variables that were split to several blocks. Args: endpoint (str): current pserver endpoint. - pserver_program (Program): call get_pserver_program first and - pass the result here. + pserver_program (Program): deprecated, call get_pserver_program first. + startup_program (Program): deprecated, should pass startup_program + when initalizing Returns: Program: parameter server side startup program. 
""" + sys.stderr.write("get_startup_program() is deprecated, call\ + get_pserver_programs() to get pserver main and startup\ + in a single call.") + if pserver_program != None: + sys.stderr.write("passing pserver_program to get_startup_program()\ + is deprecated, you can use new API get_pserver_programs() to\ + get both pserver main program and startup program.") + if startup_program != None: + sys.stderr.write("passing startup_program to get_startup_program()\ + is deprecated, use fluid.program_guard() or pass this argument\ + to transpile() call.") + s_prog = Program() - orig_s_prog = default_startup_program() + orig_s_prog = self.startup_program + s_prog.random_seed = orig_s_prog.random_seed params = self.param_grad_ep_mapping[endpoint]["params"] def _get_splited_name_and_shape(varname): @@ -533,24 +747,26 @@ class DistributeTranspiler(object): # 1. create vars in pserver program to startup program pserver_vars = pserver_program.global_block().vars - created_var_map = dict() - for _, var in pserver_vars.iteritems(): - tmpvar = s_prog.global_block().clone_variable(var) + created_var_map = collections.OrderedDict() + for _, var in six.iteritems(pserver_vars): + tmpvar = s_prog.global_block()._clone_variable(var) created_var_map[var.name] = tmpvar # 2. rename op outputs for op in orig_s_prog.global_block().ops: - new_outputs = dict() + new_outputs = collections.OrderedDict() # do not append startup op if var is not on this pserver op_on_pserver = False - for key in op.output_names: - newname, _ = _get_splited_name_and_shape(op.output(key)[0]) - if newname: - op_on_pserver = True - new_outputs[key] = created_var_map[newname] - elif op.output(key)[0] in pserver_vars: - op_on_pserver = True - new_outputs[key] = pserver_vars[op.output(key)[0]] + # TODO(gongwb): remove this line. 
+ if op.type not in ["recv", "fetch_barrier", "concat"]: + for key in op.output_names: + newname, _ = _get_splited_name_and_shape(op.output(key)[0]) + if newname: + op_on_pserver = True + new_outputs[key] = created_var_map[newname] + elif op.output(key)[0] in pserver_vars: + op_on_pserver = True + new_outputs[key] = pserver_vars[op.output(key)[0]] if op_on_pserver: # most startup program ops have no inputs @@ -559,14 +775,37 @@ class DistributeTranspiler(object): if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: - op.attrs["shape"] = new_outputs["Out"].shape + op.set_attr("shape", list(new_outputs["Out"].shape)) s_prog.global_block().append_op( type=op.type, inputs=new_inputs, outputs=new_outputs, - attrs=op.attrs) + attrs=op.all_attrs()) + + # add slice vars + s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint) + return s_prog + def _get_slice_vars_and_attrs(self, endpoint): + slice_vars_and_attrs = [] + block_suffix = "block" + for param in self.param_grad_ep_mapping[endpoint]["params"]: + orig_var_name, block_name, _ = self._get_varname_parts(param.name) + if not block_name: + continue + + block_idx = int(block_name.split(block_suffix)[1]) + orig_var = self.origin_program.global_block().vars[orig_var_name] + + skip_numel = 0 + slice_vars = self.param_var_mapping[orig_var_name] + for slice_var in slice_vars[:block_idx]: + skip_numel += reduce(lambda x, y: x * y, slice_var.shape) + slice_vars_and_attrs.append([orig_var, skip_numel, param]) + + return slice_vars_and_attrs + # ====================== private transpiler functions ===================== def _has_distributed_lookup_table(self): @@ -578,7 +817,7 @@ class DistributeTranspiler(object): self.table_name = None for op in self.origin_program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: - if op.attrs['is_distributed'] is True: + if op.attr('is_distributed') is True: if self.table_name is None: self.table_name = op.input("W")[0] if self.table_name != op.input("W")[0]: @@ -630,7 +869,7 @@ class DistributeTranspiler(object): ] return param_list, grad_list - def _init_splited_vars(self, slice_var_up): + def _init_splited_vars(self): # update these mappings for further transpile: # 1. param_var_mapping: param var name -> [splited params vars] # 2. grad_var_mapping: grad var name -> [splited grads vars] @@ -654,35 +893,42 @@ class DistributeTranspiler(object): param_list, grad_list = self._update_dist_lookup_table_vars( param_list, grad_list, self.params_grads) - if slice_var_up: + if self.config.slice_var_up: # when we slice var up into blocks, we will slice the var according to # pserver services' count. A pserver may have two or more listening ports. - grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + grad_blocks = slice_variable(grad_list, + len(self.pserver_endpoints), + self.config.min_block_size) param_blocks = slice_variable(param_list, - len(self.pserver_endpoints)) + len(self.pserver_endpoints), + self.config.min_block_size) else: # when we do NOT slice var up into blocks, we will always slice params # grads into one block. 
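The new get_pserver_programs() API introduced above returns the pserver main and startup programs in one call, replacing the now-deprecated two-call pattern. A sketch of the pserver side, reusing the transpiler ``t`` from the trainer sketch and a placeholder endpoint:

.. code-block:: python

    current_endpoint = "127.0.0.1:6170"  # placeholder; normally read from the environment
    pserver_prog, pserver_startup = t.get_pserver_programs(current_endpoint)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(pserver_startup)
    exe.run(pserver_prog)  # listens for trainers and applies parameter updates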
- grad_blocks = slice_variable(grad_list, 1) - param_blocks = slice_variable(param_list, 1) + grad_blocks = slice_variable(grad_list, 1, + self.config.min_block_size) + param_blocks = slice_variable(param_list, 1, + self.config.min_block_size) assert (len(grad_blocks) == len(param_blocks)) - # origin_varname -> [splited_var] + # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, grad_blocks, add_trainer_suffix=self.trainer_num > 1) - self.grad_param_mapping = dict() + # dict(grad_splited_var -> param_splited_var) + self.grad_param_mapping = collections.OrderedDict() for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ - self.param_var_mapping[p_name][int(p_bid)] + self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program - self.param_grad_ep_mapping = dict() + self.param_grad_ep_mapping = collections.OrderedDict() [ self.param_grad_ep_mapping.update({ ep: { @@ -719,21 +965,21 @@ class DistributeTranspiler(object): out_name = op.output("Out") ids_var = program.global_block().vars[ids_name[0]] - prefetch_input_vars = self.create_splited_vars( + prefetch_input_vars = self._create_splited_vars( source_var=ids_var, block=program.global_block(), tag="_prefetch_in_") self.all_prefetch_input_vars.append(prefetch_input_vars) out_var = program.global_block().vars[out_name[0]] - prefetch_output_vars = self.create_splited_vars( + prefetch_output_vars = self._create_splited_vars( source_var=out_var, block=program.global_block(), tag="_prefetch_out_") self.all_prefetch_output_vars.append(prefetch_output_vars) # insert split_ids_op - program.global_block().insert_op( + program.global_block()._insert_op( index=lookup_table_op_index, type="split_ids", inputs={ @@ -745,18 +991,20 @@ class DistributeTranspiler(object): outputs={"Out": prefetch_input_vars}) # insert prefetch_op - program.global_block().insert_op( + program.global_block()._insert_op( index=lookup_table_op_index + 1, type="prefetch", inputs={'X': prefetch_input_vars}, outputs={"Out": prefetch_output_vars}, attrs={ "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + # FIXME(qiao) temporarily disable this config because prefetch + # is not act as other rpc op, it's more like a forward op + # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) # insert concat_op - program.global_block().insert_op( + program.global_block()._insert_op( index=lookup_table_op_index + 2, type="merge_ids", inputs={ @@ -787,22 +1035,26 @@ class DistributeTranspiler(object): if table_grad_name in op.output_arg_names: op_index = list(all_ops).index(op) # insert split_ids_op - program.global_block().insert_op( + program.global_block()._insert_op( index=op_index + 1, type="split_ids", inputs={ 'Ids': [program.global_block().vars[table_grad_name]] }, outputs={"Out": self.trainer_side_table_grad_list}) - program.global_block().insert_op( + program.global_block()._insert_op( index=op_index + 2, type="send", inputs={'X': self.trainer_side_table_grad_list}, - outputs={}, + outputs={'Out': []}, attrs={ "sync_mode": True, "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + 
self.grad_name_to_param_name[table_grad_name], + table_grad_name + ] }) break @@ -845,26 +1097,32 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program origin_param_var = self.origin_program.global_block().vars[ self.table_name] + + zero_dim = int( + math.ceil(origin_param_var.shape[0] / float( + len(self.pserver_endpoints)))) + table_shape = list(origin_param_var.shape) + table_shape[0] = zero_dim + param_var = pserver_program.global_block().create_var( name=origin_param_var.name, - shape=origin_param_var.shape, + shape=table_shape, dtype=origin_param_var.dtype, type=core.VarDesc.VarType.SELECTED_ROWS, persistable=True) # parameter must be selected rows param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) - grad_var = pserver_program.global_block().clone_variable( + grad_var = pserver_program.global_block()._clone_variable( self.origin_program.global_block().vars[grad_var_name( self.table_name)]) # create table optimize block in pserver program table_opt_op = [ op for op in self.optimize_ops - if op.input("Param")[0] == self.table_name + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] table_opt_block = pserver_program.create_block(pre_block_idx) - # only support sgd now - assert table_opt_op.type == "sgd" if self.sync_mode: # create grad vars in pserver program @@ -893,7 +1151,7 @@ class DistributeTranspiler(object): if not splited_grad_name.startswith(origin_grad_name): raise ValueError("origin_grad_var: " + splited_grad_name + " grad_var:" + grad_var.name) - grad_var = pserver_program.global_block().rename_var( + grad_var = pserver_program.global_block()._rename_var( origin_grad_name, splited_grad_name) lr_var = pserver_program.global_block().vars[table_opt_op.input( @@ -904,11 +1162,12 @@ class DistributeTranspiler(object): "LearningRate": [lr_var] } outputs = {"ParamOut": [param_var]} - table_opt_block.append_op( - type=table_opt_op.type, - inputs=inputs, - outputs=outputs, - attrs=table_opt_op.attrs) + # only support sgd now + import logging + logging.warn( + "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of " + + table_opt_op.type) + table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs) # add table parameter gradient and it's block id to grad_to_block_id grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx)) @@ -949,34 +1208,33 @@ class DistributeTranspiler(object): block_list (list[(varname, block_id, block_size)]): List of gradient blocks. add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True. Returns: - var_mapping (dict(varname->[new_varname_variable])):A dict mapping + var_mapping (collections.OrderedDict(varname->[new_varname_variable])):A dict mapping from original var name to each var split. 
""" # varname->[(block_id, current_block_size)] - block_map = dict() + block_map = collections.OrderedDict() - var_mapping = dict() + var_mapping = collections.OrderedDict() for block_str in block_list: varname, offset, size = block_str.split(":") - if not block_map.has_key(varname): + if varname not in block_map: block_map[varname] = [] - block_map[varname].append((long(offset), long(size))) + block_map[varname].append((int(offset), int(size))) - for varname, splited in block_map.iteritems(): + for varname, splited in six.iteritems(block_map): orig_var = program.global_block().var(varname) if len(splited) == 1: if self.sync_mode and add_trainer_suffix: new_var_name = "%s.trainer_%d" % \ (orig_var.name, self.trainer_id) - program.global_block().rename_var(varname, new_var_name) + program.global_block()._rename_var(varname, new_var_name) var_mapping[varname] = \ [program.global_block().var(new_var_name)] else: var_mapping[varname] = \ [program.global_block().var(orig_var.name)] continue - var_mapping[varname] = [] orig_shape = orig_var.shape orig_dim1_flatten = 1 @@ -985,7 +1243,7 @@ class DistributeTranspiler(object): for i, block in enumerate(splited): size = block[1] - rows = size / orig_dim1_flatten + rows = size // orig_dim1_flatten splited_shape = [rows] if len(orig_shape) >= 2: splited_shape.extend(orig_shape[1:]) @@ -1003,10 +1261,10 @@ class DistributeTranspiler(object): type=orig_var.type, shape=splited_shape) # flattend splited var var_mapping[varname].append(var) - program.global_block().sync_with_cpp() + program.global_block()._sync_with_cpp() return var_mapping - def create_splited_vars(self, source_var, block, tag): + def _create_splited_vars(self, source_var, block, tag): return [ block.create_var( name=str(source_var.name + tag + str(index)), @@ -1017,7 +1275,6 @@ class DistributeTranspiler(object): ] def _clone_var(self, block, var, persistable=True): - assert isinstance(var, Variable) return block.create_var( name=var.name, shape=var.shape, @@ -1031,7 +1288,7 @@ class DistributeTranspiler(object): height_sections = [] for v in splited_vars: height_sections.append(v.shape[0]) - program.global_block().insert_op( + program.global_block()._insert_op( index=index + 1, type="split_selected_rows", inputs={"X": orig_var}, @@ -1041,7 +1298,7 @@ class DistributeTranspiler(object): sections = [] for v in splited_vars: sections.append(v.shape[0]) - program.global_block().insert_op( + program.global_block()._insert_op( index=index + 1, type="split_byref", inputs={"X": orig_var}, @@ -1072,8 +1329,8 @@ class DistributeTranspiler(object): elif op_type == "momentum": if varkey == "Velocity": return param_shape - elif op_type == "": - if varkey == "Moment": + elif op_type == "rmsprop": + if varkey in ["Moment", "MeanSquare"]: return param_shape elif op_type == "sgd": pass @@ -1127,7 +1384,7 @@ class DistributeTranspiler(object): grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx)) if self.sync_mode and self.trainer_num > 1: vars2merge = [] - for i in xrange(self.trainer_num): + for i in range(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ (merged_var_name, i) vars2merge.append(pserver_block.vars[per_trainer_name]) @@ -1137,32 +1394,33 @@ class DistributeTranspiler(object): inputs={"X": vars2merge}, outputs={"Out": merged_var}, attrs={"use_mkldnn": False}) - # TODO(panyx0718): What if it's SELECTED_ROWS. 
- if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: - optimize_block.append_op( - type="scale", - inputs={"X": merged_var}, - outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainer_num)}) + optimize_block.append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainer_num)}) return merged_var def _append_pserver_ops(self, optimize_block, opt_op, endpoint, grad_to_block_id, origin_program, merged_var): program = optimize_block.program pserver_block = program.global_block() - new_inputs = dict() - # update param/grad shape first, then other inputs like - # moment can use the updated shape + new_inputs = collections.OrderedDict() + + def _get_param_block(opt_op): + # param is already created on global program + param_block = None + for p in self.param_grad_ep_mapping[endpoint]["params"]: + if same_or_split_var(p.name, opt_op.input("Param")[0]): + param_block = p + break + return param_block + for key in opt_op.input_names: if key == "Grad": new_inputs[key] = merged_var elif key == "Param": - # param is already created on global program - param_block = None - for p in self.param_grad_ep_mapping[endpoint]["params"]: - if same_or_split_var(p.name, opt_op.input(key)[0]): - param_block = p - break + param_block = _get_param_block(opt_op) if not param_block: return tmpvar = pserver_block.create_var( @@ -1175,7 +1433,7 @@ class DistributeTranspiler(object): # learning rate variable has already be created by non-optimize op, # don't create it once again. lr_varname = opt_op.input(key)[0] - if pserver_block.vars.has_key(lr_varname): + if lr_varname in pserver_block.vars: new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]] else: origin_var = origin_program.global_block().vars[lr_varname] @@ -1211,11 +1469,11 @@ class DistributeTranspiler(object): type=opt_op.type, inputs=new_inputs, outputs=outputs, - attrs=opt_op.attrs) + attrs=opt_op.all_attrs()) def _is_splited_grad_var(self, var, var_dict): grad_block = None - for _, g in var_dict.iteritems(): + for _, g in six.iteritems(var_dict): if self._orig_varname(g.name) == self._orig_varname(var.name): if g.name.find(".trainer_") == -1: grad_block = g @@ -1225,31 +1483,31 @@ class DistributeTranspiler(object): def _clone_lr_op(self, program, block, op): inputs = self._get_input_map_from_op( self.origin_program.global_block().vars, op) - for key, varlist in inputs.iteritems(): + for key, varlist in six.iteritems(inputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: if var not in program.global_block().vars: - block.clone_variable(var) + block._clone_variable(var) outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, op) - for key, varlist in outputs.iteritems(): + for key, varlist in six.iteritems(outputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: if var not in program.global_block().vars: - block.clone_variable(var) + block._clone_variable(var) return block.append_op( - type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) + type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs()) def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated inputs = self._get_input_map_from_op( self.origin_program.global_block().vars, opt_op) - for key, varlist in inputs.iteritems(): + for key, varlist in six.iteritems(inputs): if not isinstance(varlist, list): 
varlist = [varlist] for var in varlist: @@ -1259,7 +1517,7 @@ class DistributeTranspiler(object): var, program.global_block().vars) if grad_block: inputs[key] = grad_block - elif not program.global_block().vars.has_key(var.name): + elif var.name not in program.global_block().vars: program.global_block().create_var( name=var.name, persistable=var.persistable, @@ -1268,7 +1526,7 @@ class DistributeTranspiler(object): outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) - for key, varlist in outputs.iteritems(): + for key, varlist in six.iteritems(outputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: @@ -1276,47 +1534,29 @@ class DistributeTranspiler(object): var, program.global_block().vars) if grad_block: outputs[key] = grad_block - elif not program.global_block().vars.has_key(var.name): - program.global_block().clone_variable(var) + elif var.name not in program.global_block().vars: + program.global_block()._clone_variable(var) return optimize_block.append_op( type=opt_op.type, inputs=inputs, outputs=outputs, - attrs=opt_op.attrs) + attrs=opt_op.all_attrs()) def _is_op_connected(self, op1, op2): # If one op's input is another op's output or # one op's output is another op's input, we say # the two operator is connected. - def _append_inname_remove_beta(varname_list): - op_input_names = [] - for in_name in varname_list: - # HACK: remove beta1 and beta2 to avoid let all - # ops connected. - if in_name.startswith("beta2_pow_acc") or \ - in_name.startswith("beta1_pow_acc"): - continue - else: - op_input_names.append(in_name) - return op_input_names - - op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names()) - op1_output_names = op1.desc.output_arg_names() - - op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names()) - op2_output_names = op2.desc.output_arg_names() - - if set(op1_output_names) & set(op2_input_names) or \ - set(op1_input_names) & set(op2_output_names): + if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ + set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): return True return False def _create_ufind(self, optimize_ops): # Create a unit find data struct by optimize ops ufind = UnionFind(optimize_ops) - for i in xrange(len(optimize_ops)): - for j in xrange(i, len(optimize_ops)): + for i in range(len(optimize_ops)): + for j in range(i, len(optimize_ops)): op1 = optimize_ops[i] op2 = optimize_ops[j] if self._is_op_connected(op1, op2): @@ -1344,7 +1584,7 @@ class DistributeTranspiler(object): def _get_input_map_from_op(self, varmap, op): """Returns a dict from op input name to the vars in varmap.""" - iomap = dict() + iomap = collections.OrderedDict() for key in op.input_names: vars = [] for varname in op.input(key): @@ -1357,7 +1597,7 @@ class DistributeTranspiler(object): def _get_output_map_from_op(self, varmap, op): """Returns a dict from op output name to the vars in varmap.""" - iomap = dict() + iomap = collections.OrderedDict() for key in op.output_names: vars = [] for varname in op.output(key): @@ -1406,14 +1646,14 @@ class DistributeTranspiler(object): # optimize op_maker = core.op_proto_and_checker_maker optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attrs and \ - int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): + if op_maker.kOpRoleAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): return True return False def 
_get_optimize_pass(self): """ - Get optimizer operators, paramters and gradients from origin_program + Get optimizer operators, parameters and gradients from origin_program Returns: opt_ops (list): optimize operators. params_grads (dict): paramter->gradient. @@ -1430,26 +1670,12 @@ class DistributeTranspiler(object): # and op_role_var to get the pair. for input_name in op.input_arg_names: if input_name.find("@GRAD") != -1 and \ - op.attrs[RPC_OP_ROLE_ATTR_NAME]: - param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0] + op.attr(RPC_OP_ROLE_ATTR_NAME): + param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0] params_grads.append([ origin_var_dict[param_name], origin_var_dict[input_name] ]) - elif self._is_adam_connected_op(op): - opt_ops.append(op) else: pass return opt_ops, params_grads - - def _is_adam_connected_op(self, op): - """ - A hack function to determinate whether the input operator - is connected to optimize operator. - """ - if op.type == "scale": - for in_name in op.input_arg_names: - if in_name.startswith("beta1_pow_acc") or \ - in_name.startswith("beta2_pow_acc"): - return True - return False diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index b8afeae5ebd6ef7948a7c0c2775f419af461da04..49ba2cfd55bc881ed753fcefbd41f5b8fd4ebaf7 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import os import numpy as np from .. import core @@ -57,10 +59,82 @@ class InferenceTranspiler(object): scope = global_scope() if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") - self.fuse_batch_norm(program, place, scope) - self.fuse_relu_mkldnn(program) + use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + + self._fuse_batch_norm(program, place, scope) + if use_mkldnn: + self._fuse_conv_bias_mkldnn(program) + self._fuse_conv_relu_mkldnn(program) + self._fuse_conv_eltwise_mkldnn(program) + self._fuse_conv_relu_mkldnn( + program) # ResNet residual block merging + self._fuse_bn_relu_mkldnn(program) + + def _fuse_conv_eltwise_mkldnn(self, program): + ''' + Transpile the program fusing elementwise_add into conv for MKLDNN + program. Elementwise add following convolution OP can be fused by adding + 'fuse_eltwise' attribute to convolution OP and replacing its output + Tensor with second parameter of elementwise_add. + The result of fuse is: + - before: + - conv->elementwise_add->any_other_op + - after: + - conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type in ['conv2d']: + next_op = self.block.ops[i + 1] + if next_op.type == 'elementwise_add': + self._fuse_conv_eltwise(current_op, next_op) + self.block._remove_op(i + 1) # Remove elementwise_add + i = i + 1 + self._adjust_input() + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + + def _fuse_conv_relu_mkldnn(self, program): + ''' + Transpile the program by fused relu activation for MKLDNN program. 
+ Relu activation following convolution OP can be fused by adding + 'fuse_relu' attribute to convolution OP. + The result of fuse is: + - before: + - conv->relu->any_other_op + - after: + - conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) - def fuse_relu_mkldnn(self, program): + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type in ['conv2d']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify bnorm OP to include relu + current_op.set_attr("fuse_relu", True) + # remove relu OP + self.block._remove_op(i + 1) + i = i + 1 + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + + def _fuse_bn_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. @@ -80,10 +154,6 @@ class InferenceTranspiler(object): :param program: program to transpile :type program: Program ''' - use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) - if not use_mkldnn: - return - self.block = program.block(0) i = 0 @@ -95,7 +165,7 @@ class InferenceTranspiler(object): # modify bnorm OP to include relu current_op.set_attr("fuse_with_relu", True) # remove relu OP - self.block.remove_op(i + 1) + self.block._remove_op(i + 1) i = i + 1 self._remove_unused_var() @@ -104,7 +174,69 @@ class InferenceTranspiler(object): # And a better solution will be considered later. program = program.clone() - def fuse_batch_norm(self, program, place, scope): + def _fuse_conv_bias_mkldnn(self, program): + ''' + Transpile the program by fused convolution and elementwise_add. + + Replace conv2d and elementwise_add ops with a new conv2d op + based on an old conv2d op and the :math:`Bias` taken from + elementwise_add. + + For input :math:`X`: + + - Conv process: :math:`X = input * W` + - Elementwise_add process: :math` X = X + bias` + + After fuse into one operation: + + .. math:: + + X = input * W + bias + + The operator transformation is: + + - before: + + - conv->elementwise_add->any_other_op + + - after: + + - conv->any_other_op + + The transpile stages are: + + 1. Extract bias and output variables from elementwise_add. + 2. Extract Input, Weight and attributes from conv op. + 3. Create a new convolution op based on extracted params. + 4. Remove old conv op. + 5. Remove elementwise_add. + 5. Remove unused variables. + + Args: + program (Program): program to transpile + + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops) - 2: + current_op = self.block.ops[i] + next_op = self.block.ops[i + 1] + # conv2d with bias + if current_op.type in ['conv2d'] and \ + next_op.type in ['elementwise_add']: + self._fuse_conv_bias(i, current_op, next_op) + self.block._remove_op(i + 1) # Remove old conv + self.block._remove_op(i + 1) # Remove elementwise_add + i = i + 1 + + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + + def _fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. 
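These MKLDNN fusions are applied through the inference transpiler's public transpile() entry point, which only runs them when FLAGS_use_mkldnn is set. A rough usage sketch, assuming ``inference_program`` was loaded elsewhere (for example via fluid.io.load_inference_model):

.. code-block:: python

    import os
    import paddle.fluid as fluid

    os.environ["FLAGS_use_mkldnn"] = "1"   # enable the conv/bn fusions described above

    place = fluid.CPUPlace()
    t = fluid.transpiler.InferenceTranspiler()
    t.transpile(inference_program, place)  # mutates the passed program's blocks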
@@ -171,7 +303,7 @@ class InferenceTranspiler(object): # fuse batch_norm self._fuse_param(current_op, next_op, bias_op, 0) # remove batch_norm_op - self.block.remove_op(i + 2) + self.block._remove_op(i + 2) i = i + 1 # conv2d with bias, the next_op.type is elementwise_add elif (next_op.type == 'elementwise_add'): @@ -180,10 +312,9 @@ class InferenceTranspiler(object): # fuse batch_norm self._fuse_param(current_op, next_next_op, next_op, 1) # remove batch_norm_op - self.block.remove_op(i + 2) + self.block._remove_op(i + 2) i = i + 1 i = i + 1 - self._adjust_input() self._remove_unused_var() # TODO(luotao): use clone() method to flush the program.desc in force, @@ -212,7 +343,7 @@ class InferenceTranspiler(object): y_var = self.block.var(bn_op.input("Bias")[0]) out_var = self.block.var(bn_op.output("Y")[0]) - bias_op = self.block.insert_op( + bias_op = self.block._insert_op( index, type="elementwise_add", inputs={"X": x_var, @@ -286,6 +417,47 @@ class InferenceTranspiler(object): # collect the renamed input self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] + def _fuse_conv_bias(self, index, conv_op, elementwise_add_op): + ''' + fuse the conv op with elementwise_add + + :param index: index of the conv_op in ops list + :type index: Int + :param conv_op: convolution operator + :type conv_op: Operator + :param elementwise_add_op: convolution's bias operator + :type elementwise_add_op: Operator + ''' + + bias_var = self.block.var(elementwise_add_op.input("Y")[0]) + out_var = self.block.var(elementwise_add_op.output("Out")[0]) + filter_var = self.block.var(conv_op.input("Filter")[0]) + in_var = self.block.var(conv_op.input("Input")[0]) + attrs = {name: conv_op.attr(name) for name in conv_op.attr_names} + + self.block._insert_op( + index, + type="conv2d", + inputs={"Input": in_var, + "Filter": filter_var, + "Bias": bias_var}, + outputs={"Output": out_var}, + attrs=attrs) + + def _fuse_conv_eltwise(self, conv_op, eltwise_op): + ''' + fuse the conv op with elementwise_add + + :param conv_op: convolution operator + :type conv_op: Operator + :param eltwise_op: operator adding data from skip connection + :type eltwise_op: Operator + ''' + + conv_op.set_attr("fuse_eltwise", True) + self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0] + self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0] + def _adjust_input(self): for i in range(len(self.block.ops)): current_op = self.block.ops[i] @@ -305,6 +477,6 @@ class InferenceTranspiler(object): args += current_op.output_arg_names args = list(set(args)) # unique the input and output arguments - for var in self.block.vars.keys(): + for var in list(self.block.vars.keys()): if var not in args: - self.block.remove_var(var) + self.block._remove_var(var) diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 999ef43ca0feacbddff5f9db59589ce7097fe77e..3e58e125de4188144646236f7999c620cd8e9459 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + from collections import defaultdict from .. import core -from ..framework import Program, default_main_program, Parameter, Variable +from ... 
import compat as cpt +from ..framework import Program, default_main_program, Parameter from ..backward import _rename_arg_ +from functools import reduce +from six.moves import range dtype_to_size = { core.VarDesc.VarType.FP16: 2, @@ -107,7 +112,7 @@ class ControlFlowGraph(object): # Repeatedly apply liveness updates until the algorithm stablize # on a complete set live input vars and live output vars. while True: - for i in reversed(range(self.op_size)): + for i in reversed(list(range(self.op_size))): live_in[i] = set(self._live_in[i]) live_out[i] = set(self._live_out[i]) for s in self._successors[i]: @@ -123,15 +128,15 @@ class ControlFlowGraph(object): def _has_var(self, block_desc, var_name, is_forward): if is_forward: - return block_desc.has_var(str(var_name)) + return block_desc.has_var(cpt.to_bytes(var_name)) else: - return block_desc.has_var_recursive(str(var_name)) + return block_desc.has_var_recursive(cpt.to_bytes(var_name)) def _find_var(self, block_desc, var_name, is_forward): if is_forward: - return block_desc.find_var(str(var_name)) + return block_desc.find_var(cpt.to_bytes(var_name)) else: - return block_desc.find_var_recursive(str(var_name)) + return block_desc.find_var_recursive(cpt.to_bytes(var_name)) def _check_var_validity(self, block_desc, x, is_forward): if str(x) == "@EMPTY@": @@ -172,12 +177,13 @@ class ControlFlowGraph(object): is_forward = i < self._forward_num in_diff, out_diff = self._get_diff(self._live_in[i], self._live_out[i]) - can_optimize = filter( - lambda x: self._check_var_validity(block_desc, x, is_forward), - in_diff) + can_optimize = [ + x for x in in_diff + if self._check_var_validity(block_desc, x, is_forward) + ] if can_optimize: index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1 - delete_op = block_desc.insert_op(index) + delete_op = block_desc._insert_op(index) delete_op.set_type("delete_var") delete_op.set_input("X", can_optimize) if is_forward: @@ -213,9 +219,10 @@ class ControlFlowGraph(object): block_desc = op.block() is_forward = i < self._forward_num if self.pool: - defs_can_optimize = filter( - lambda x: self._check_var_validity(block_desc, x, is_forward), - self._defs[i]) + defs_can_optimize = [ + x for x in self._defs[i] + if self._check_var_validity(block_desc, x, is_forward) + ] out_pair = [ (x, self._find_var(block_desc, x, is_forward).shape()) for x in defs_can_optimize @@ -254,16 +261,17 @@ class ControlFlowGraph(object): # Rename the var to the cache var already with # memory allocated in order to reuse the memory. 
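The cpt helpers used in the block-desc lookups above come from the new paddle.compat import; judging from how this patch uses them, they normalize variable names to bytes or text consistently on Python 2 and 3 (a small assumed illustration):

.. code-block:: python

    import paddle.compat as cpt

    cpt.to_bytes("fc_0.w_0")    # b'fc_0.w_0', for the C++ BlockDesc lookups
    cpt.to_text(b"fc_0.w_0")    # 'fc_0.w_0', for Python-side var() lookups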
_rename_arg_(self._ops, x, cache_var, begin_idx=i) - self._program.block(block_desc.id).var(str( + self._program.block(block_desc.id).var(cpt.to_text( x)).desc = self._find_var(block_desc, cache_var, is_forward) self._update_graph(x, cache_var, begin_idx=i) break in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) - can_optimize = filter( - lambda x: self._check_var_validity(block_desc, x, is_forward), - in_diff) + can_optimize = [ + x for x in in_diff + if self._check_var_validity(block_desc, x, is_forward) + ] if can_optimize: for var_name in can_optimize: self.pool.append((var_name, self._find_var( @@ -324,6 +332,8 @@ def _process_sub_block_pair(pdesc, sub_block_pair): sub_op_output = set() sub_op_output.update(sub_op_dict[fwd_id].output_arg_names()) sub_op_output.update(sub_op_dict[grad_id].output_arg_names()) + sub_op_output.update(sub_op_dict[fwd_id].input_arg_names()) + sub_op_output.update(sub_op_dict[grad_id].input_arg_names()) ops_list.append((sub_block_ops, block_op_size, sub_op_output)) # Process rest fwd_op block ops @@ -335,6 +345,7 @@ def _process_sub_block_pair(pdesc, sub_block_pair): sub_block_ops.append(sub_block.op(i)) sub_op_output = set() sub_op_output.update(sub_op_dict[fwd_id].output_arg_names()) + sub_op_output.update(sub_op_dict[fwd_id].input_arg_names()) ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output)) return ops_list @@ -349,13 +360,17 @@ def _get_cfgs(input_program): pdesc = input_program.get_desc() block_desc = pdesc.block(0) op_size = block_desc.op_size() - # Get global block ops - ops_list.append( - ([block_desc.op(i) for i in range(op_size)], op_size, set())) # Only process one level of nested subblock. ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR)) + skip_opt_set = set() + for _, _, skip_opt in ops_list: + skip_opt_set.update(skip_opt) + + # Get global block ops + ops_list.insert( + 0, ([block_desc.op(i) for i in range(op_size)], op_size, skip_opt_set)) cfgs = [ ControlFlowGraph(input_program, ops, forward_num, skip_opt) for ops, forward_num, skip_opt in ops_list diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index dcffadd531719431f27feb464ed58a65c04770ee..6a6d14a69ba771e192a28951a6df7027741a655a 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + class PSDispatcher(object): """ diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 776619cd36722e338a9fdd5e13bceeaf3724de2c..b9957a699e597898bee75ce0e7283f7224293f0c 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
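The control-flow-graph changes in memory_optimization_transpiler.py above are internal to the pass; from user code it is normally invoked through the module's top-level entry point. A hedged sketch, with the exact call assumed from the Fluid API of this period rather than shown in this patch:

.. code-block:: python

    import paddle.fluid as fluid

    # assumed entry point; run after backward/optimizer ops exist, before the executor runs
    fluid.memory_optimize(fluid.default_main_program())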
+from __future__ import print_function + import collections import contextlib +import six import sys __all__ = ['generate', 'switch', 'guard'] @@ -67,8 +70,10 @@ def switch(new_generator=None): @contextlib.contextmanager def guard(new_generator=None): - if isinstance(new_generator, basestring): + if isinstance(new_generator, six.string_types): new_generator = UniqueNameGenerator(new_generator) + elif isinstance(new_generator, six.binary_type): + new_generator = UniqueNameGenerator(new_generator.decode()) old = switch(new_generator) yield switch(old) diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 3b059735a924d58714cd88a761eb83143f1192d6..678026cf95970e8ff58c1bad20246059ffb464c1 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator: TODO(yuyang18): Should we add whole design doc here? """ -import decorator -from decorator import * +import paddle.reader.decorator +from paddle.reader.decorator import * -import creator +import paddle.reader.creator __all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py index 4c905d959fad4e8c1a8826ce8dc60c5fa834514d..c861020225fb6fe0a29653363c2151b20dc8f578 100644 --- a/python/paddle/reader/creator.py +++ b/python/paddle/reader/creator.py @@ -67,11 +67,14 @@ def recordio(paths, buf_size=100): import recordio as rec import paddle.reader.decorator as dec - import cPickle as pickle + import six + import six.moves.cPickle as pickle def reader(): - if isinstance(paths, basestring): + if isinstance(paths, six.string_types): path = paths + elif isinstance(paths, six.binary_type): + path = paths.decode() else: path = ",".join(paths) f = rec.reader(path) diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 1f83cabb8481451736944823be45185deea4f43b..5b9459b670ac8583ee0e65a3c1b51f6248bb6303 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -14,16 +14,23 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader', + 'multiprocess_reader' ] from threading import Thread import subprocess +import multiprocessing +import sys -from Queue import Queue +from six.moves.queue import Queue +from six.moves import zip_longest +from six.moves import map +from six.moves import zip import itertools import random import zlib +import paddle.compat as cpt def map_readers(func, *readers): @@ -42,7 +49,7 @@ def map_readers(func, *readers): rs = [] for r in readers: rs.append(r()) - for e in itertools.imap(func, *rs): + for e in map(func, *rs): yield e return reader @@ -148,16 +155,16 @@ def compose(*readers, **kwargs): for r in readers: rs.append(r()) if not check_alignment: - for outputs in itertools.izip(*rs): - yield sum(map(make_tuple, outputs), ()) + for outputs in zip(*rs): + yield sum(list(map(make_tuple, outputs)), ()) else: - for outputs in itertools.izip_longest(*rs): + for outputs in zip_longest(*rs): for o in outputs: if o is None: # None will be not be present if compose is aligned raise ComposeNotAligned( "outputs of readers are not aligned.") - yield sum(map(make_tuple, outputs), ()) + yield sum(list(map(make_tuple, outputs)), ()) return reader @@ -306,7 +313,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): args = (in_queue, 
out_queue, mapper, out_order) if order else ( in_queue, out_queue, mapper) workers = [] - for i in xrange(process_num): + for i in range(process_num): worker = Thread(target=target, args=args) worker.daemon = True workers.append(worker) @@ -328,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): return xreader +def multiprocess_reader(readers, use_pipe=True, queue_size=1000): + """ + multiprocess_reader use python multi process to read data from readers + and then use multiprocess.Queue or multiprocess.Pipe to merge all + data. The process number is equal to the number of input readers, each + process call one reader. + + Multiprocess.Queue require the rw access right to /dev/shm, some + platform does not support. + + you need to create multiple readers first, these readers should be independent + to each other so that each process can work independently. + + An example: + + .. code-block:: python + + reader0 = reader(["file01", "file02"]) + reader1 = reader(["file11", "file12"]) + reader1 = reader(["file21", "file22"]) + reader = multiprocess_reader([reader0, reader1, reader2], + queue_size=100, use_pipe=False) + """ + + try: + import ujson as json + except Exception as e: + sys.stderr.write("import ujson error: " + str(e) + " use json\n") + import json + + assert type(readers) is list and len(readers) > 0 + + def _read_into_queue(reader, queue): + for sample in reader(): + if sample is None: + raise ValueError("sample has None") + queue.put(sample) + queue.put(None) + + def queue_reader(): + queue = multiprocessing.Queue(queue_size) + for reader in readers: + p = multiprocessing.Process( + target=_read_into_queue, args=(reader, queue)) + p.start() + + reader_num = len(readers) + finish_num = 0 + while finish_num < reader_num: + sample = queue.get() + if sample is None: + finish_num += 1 + else: + yield sample + + def _read_into_pipe(reader, conn): + for sample in reader(): + if sample is None: + raise ValueError("sample has None!") + conn.send(json.dumps(sample)) + conn.send(json.dumps(None)) + conn.close() + + def pipe_reader(): + conns = [] + for reader in readers: + parent_conn, child_conn = multiprocessing.Pipe() + conns.append(parent_conn) + p = multiprocessing.Process( + target=_read_into_pipe, args=(reader, child_conn)) + p.start() + + reader_num = len(readers) + finish_num = 0 + conn_to_remove = [] + while finish_num < reader_num: + for conn in conn_to_remove: + conns.remove(conn) + conn_to_remove = [] + for conn in conns: + sample = json.loads(conn.recv()) + if sample is None: + finish_num += 1 + conn.close() + conn_to_remove.append(conn) + else: + yield sample + + if use_pipe: + return pipe_reader + else: + return queue_reader + + def _buf2lines(buf, line_break="\n"): # FIXME: line_break should be automatically configured. 
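The multiprocess_reader added above can be exercised with a couple of small, independent readers; the readers below are made up for illustration and use the Queue-based path (use_pipe=False):

.. code-block:: python

    import paddle.reader

    def make_reader(start):
        # readers must be independent so each worker process can run on its own
        def reader():
            for i in range(start, start + 5):
                yield [i]
        return reader

    if __name__ == "__main__":
        multi_reader = paddle.reader.multiprocess_reader(
            [make_reader(0), make_reader(100)], use_pipe=False, queue_size=100)
        for sample in multi_reader():
            print(sample)   # samples from both workers, order not guaranteed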
lines = buf.split(line_break) @@ -387,9 +488,9 @@ class PipeReader: buff = self.process.stdout.read(self.bufsize) if buff: if self.file_type == "gzip": - decomp_buff = self.dec.decompress(buff) + decomp_buff = cpt.to_text(self.dec.decompress(buff)) elif self.file_type == "plain": - decomp_buff = buff + decomp_buff = cpt.to_text(buff) else: raise TypeError("file_type %s is not allowed" % self.file_type) diff --git a/python/paddle/reader/tests/creator_test.py b/python/paddle/reader/tests/creator_test.py index c4238c12a74759d52eb09f31ce1126cc93dd3489..d7107610a5dd751cad8f8365aec32c6ba92c53ae 100644 --- a/python/paddle/reader/tests/creator_test.py +++ b/python/paddle/reader/tests/creator_test.py @@ -29,6 +29,7 @@ import os import unittest import numpy as np import paddle.reader.creator +import six class TestNumpyArray(unittest.TestCase): @@ -37,7 +38,7 @@ class TestNumpyArray(unittest.TestCase): x = np.array(l, np.int32) reader = paddle.reader.creator.np_array(x) for idx, e in enumerate(reader()): - self.assertItemsEqual(e, l[idx]) + six.assertCountEqual(self, e, l[idx]) class TestTextFile(unittest.TestCase): diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index bee24d3b6579db5e99ec66931df201fdf9e1af07..c324092f8850e4bd64955aa9c987746b5cec54b5 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -14,6 +14,7 @@ import time import unittest +import functools import paddle.reader @@ -136,7 +137,7 @@ class TestXmap(unittest.TestCase): reader = paddle.reader.xmap_readers(mapper, reader_creator_10(0), tNum, size, order) - for n in xrange(3): + for n in range(3): result = [] for i in reader(): result.append(i) @@ -156,7 +157,7 @@ class TestPipeReader(unittest.TestCase): import tempfile - records = [str(i) for i in xrange(5)] + records = [str(i) for i in range(5)] temp = tempfile.NamedTemporaryFile() try: with open(temp.name, 'w') as f: @@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase): temp.close() +class TestMultiProcessReader(unittest.TestCase): + def setup(self): + self.samples = [] + for i in range(1000): + self.samples.append([[i], [i + 1, i + 2], i + 3]) + + def reader(index): + for i in range(len(self.samples)): + if i % 3 == index: + yield self.samples[i] + + self.reader0 = functools.partial(reader, 0) + self.reader1 = functools.partial(reader, 1) + self.reader2 = functools.partial(reader, 2) + + def reader_test(self, use_pipe): + self.setup() + results = [] + for data in paddle.reader.multiprocess_reader( + [self.reader0, self.reader1, self.reader2], 100, use_pipe)(): + results.append(data) + self.assertEqual(sorted(self.samples), sorted(results)) + + def test_multi_process_reader(self): + self.reader_test(use_pipe=False) + self.reader_test(use_pipe=True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py index 6af250772859811b3c48434ab005e50b435dd320..374976db9f17ad9b1fd33c5d4adf77155336d100 100644 --- a/python/paddle/trainer/PyDataProviderWrapper.py +++ b/python/paddle/trainer/PyDataProviderWrapper.py @@ -42,7 +42,7 @@ except ImportError: try: import cPickle as pickle except ImportError: - import pickle + import six.moves.cPickle as pickle import io diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index ab9a2562dcccb394c0b24741ceeb10061e40cb9a..a2a32d848cbc4200397e6a12a3662419102da0a9 100644 
--- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -20,7 +20,7 @@ from .utils import deprecated try: import cPickle as pickle except ImportError: - import pickle + import six.moves.cPickle as pickle __all__ = ['define_py_data_sources2'] diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d9787ef42a31b8dfd1836e7a01d5664049cc66b5..ee34c157334b533b9c330b8103424964d7df510b 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -28,7 +28,7 @@ from .default_decorators import * try: import cPickle as pickle except ImportError: - import pickle + import six.moves.cPickle as pickle import copy __all__ = [ diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py index d27af7f76246a4c9db9a43c17715506d82031b9c..6a96a0a78fc77c50904ee7822c725c41e646c5e6 100644 --- a/python/paddle/utils/dump_config.py +++ b/python/paddle/utils/dump_config.py @@ -37,9 +37,9 @@ if __name__ == '__main__': assert isinstance(conf, TrainerConfig_pb2.TrainerConfig) if whole_conf: - print conf + print(conf) else: if binary: sys.stdout.write(conf.model_config.SerializeToString()) else: - print conf.model_config + print(conf.model_config) diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py index 3e3e519f76d388eeb477f0014bcbb3e7cd09352a..d1bbda3fd3562efe486377d41a9fb7359bafa4e7 100644 --- a/python/paddle/utils/image_multiproc.py +++ b/python/paddle/utils/image_multiproc.py @@ -15,7 +15,8 @@ import os, sys import numpy as np from PIL import Image -from cStringIO import StringIO +import six +from six.moves import cStringIO as StringIO import multiprocessing import functools import itertools @@ -187,7 +188,8 @@ class PILTransformer(ImageTransformer): return self.transform(im) -def job(is_img_string, transformer, (data, label)): +def job(is_img_string, transformer, data_label_pack): + (data, label) = data_label_pack if is_img_string: return transformer.transform_from_string(data), label else: @@ -208,7 +210,7 @@ class MultiProcessImageTransformer(object): """ Processing image with multi-process. If it is used in PyDataProvider, the simple usage for CNN is as follows: - + .. code-block:: python def hool(settings, is_train, **kwargs): @@ -229,7 +231,7 @@ class MultiProcessImageTransformer(object): @provider(init_hook=hook, pool_size=20480) def process(settings, file_list): with open(file_list, 'r') as fdata: - for line in fdata: + for line in fdata: data_dic = np.load(line.strip()) # load the data batch pickled by Pickle. data = data_dic['data'] labels = data_dic['label'] @@ -249,10 +251,10 @@ class MultiProcessImageTransformer(object): :type channel_swap: tuple or list :param mean: the mean values of image, per-channel mean or element-wise mean. :type mean: array, The dimension is 1 for per-channel mean. - The dimension is 3 for element-wise mean. + The dimension is 3 for element-wise mean. :param is_train: training peroid or testing peroid. :type is_train: bool. - :param is_color: the image is color or gray. + :param is_color: the image is color or gray. :type is_color: bool. :param is_img_string: The input can be the file name of image or image string. :type is_img_string: bool. 
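The job() signature change in image_multiproc.py above is required because Python 3 removed tuple parameter unpacking (PEP 3113); the portable pattern, as applied in this patch, unpacks inside the function body:

.. code-block:: python

    # Python 2-only syntax, rejected by the Python 3 parser:
    #     def job(is_img_string, transformer, (data, label)): ...
    # Portable form used above:
    def job(is_img_string, transformer, data_label_pack):
        data, label = data_label_pack
        if is_img_string:
            return transformer.transform_from_string(data), label
        return transformer.transform(data), label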
@@ -273,4 +275,4 @@ class MultiProcessImageTransformer(object): def run(self, data, label): fun = functools.partial(job, self.is_img_string, self.transformer) return self.pool.imap_unordered( - fun, itertools.izip(data, label), chunksize=100 * self.procnum) + fun, six.moves.zip(data, label), chunksize=100 * self.procnum) diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py index d3d79b14405256bbc95c41d805dbee56cb104f5e..a8092349cde8a4cb30873bf819fd5ed96289a945 100644 --- a/python/paddle/utils/image_util.py +++ b/python/paddle/utils/image_util.py @@ -14,7 +14,7 @@ import numpy as np from PIL import Image -from cStringIO import StringIO +from six.moves import cStringIO as StringIO def resize_image(img, target_size): @@ -34,7 +34,7 @@ def flip(im): """ Return the flipped image. Flip an image along the horizontal direction. - im: input image, (H x W x K) ndarrays + im: input image, (H x W x K) ndarrays """ if len(im.shape) == 3: return im[:, :, ::-1] @@ -132,7 +132,7 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True): def load_image(img_path, is_color=True): """ - Load image and return. + Load image and return. img_path: image path. is_color: is color image or not. """ @@ -205,7 +205,7 @@ class ImageTransformer: def set_mean(self, mean): if mean is not None: - # mean value, may be one value per channel + # mean value, may be one value per channel if mean.ndim == 1: mean = mean[:, np.newaxis, np.newaxis] else: diff --git a/python/paddle/utils/make_model_diagram.py b/python/paddle/utils/make_model_diagram.py index 40f99075de7fb2401b3b704afe1eb44dbe6072dd..52759d3ad230c3a5a5488a8bc46a2e8f8fae1025 100644 --- a/python/paddle/utils/make_model_diagram.py +++ b/python/paddle/utils/make_model_diagram.py @@ -15,6 +15,9 @@ # Generate dot diagram file for the given paddle model config # The generated file can be viewed using Graphviz (http://graphviz.org) +from __future__ import print_function + +import six import sys import traceback @@ -61,9 +64,9 @@ def make_diagram_from_proto(model_config, dot_file): name2id[mem.link_name]) return s - print >> f, 'digraph graphname {' - print >> f, 'node [width=0.375,height=0.25];' - for i in xrange(len(model_config.layers)): + print('digraph graphname {', file=f) + print('node [width=0.375,height=0.25];', file=f) + for i in six.moves.xrange(len(model_config.layers)): l = model_config.layers[i] name2id[l.name] = i @@ -71,12 +74,12 @@ def make_diagram_from_proto(model_config, dot_file): for sub_model in model_config.sub_models: if sub_model.name == 'root': continue - print >> f, 'subgraph cluster_%s {' % i - print >> f, 'style=dashed;' + print('subgraph cluster_%s {' % i, file=f) + print('style=dashed;', file=f) label = '%s ' % sub_model.name if sub_model.reversed: label += '<==' - print >> f, 'label = "%s";' % label + print('label = "%s";' % label, file=f) i += 1 submodel_layers.add(sub_model.name) for layer_name in sub_model.layer_names: @@ -84,37 +87,41 @@ def make_diagram_from_proto(model_config, dot_file): lid = name2id[layer_name] layer_config = model_config.layers[lid] label = make_layer_label(layer_config) - print >> f, 'l%s [label="%s", shape=box];' % (lid, label) - print >> f, '}' + print('l%s [label="%s", shape=box];' % (lid, label), file=f) + print('}', file=f) - for i in xrange(len(model_config.layers)): + for i in six.moves.xrange(len(model_config.layers)): l = model_config.layers[i] if l.name not in submodel_layers: label = make_layer_label(l) - print >> f, 'l%s [label="%s", shape=box];' % (i, label) + 
print('l%s [label="%s", shape=box];' % (i, label), file=f) for sub_model in model_config.sub_models: if sub_model.name == 'root': continue for link in sub_model.in_links: - print >> f, make_link(link) + print(make_link(link), file=f) for link in sub_model.out_links: - print >> f, make_link(link) + print(make_link(link), file=f) for mem in sub_model.memories: - print >> f, make_mem(mem) + print(make_mem(mem), file=f) - for i in xrange(len(model_config.layers)): + for i in six.moves.xrange(len(model_config.layers)): for l in model_config.layers[i].inputs: - print >> f, 'l%s -> l%s [label="%s"];' % ( - name2id[l.input_layer_name], i, l.input_parameter_name) + print( + 'l%s -> l%s [label="%s"];' % (name2id[l.input_layer_name], i, + l.input_parameter_name), + file=f) - print >> f, '}' + print('}', file=f) f.close() def usage(): - print >> sys.stderr, ("Usage: python show_model_diagram.py" + - " CONFIG_FILE DOT_FILE [config_str]") + print( + ("Usage: python show_model_diagram.py" + + " CONFIG_FILE DOT_FILE [config_str]"), + file=sys.stderr) exit(1) diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py index 2b100207728a8532e900992f7db4d3910e893dea..b74649e93640c3600636034d58792b8d12dffeda 100644 --- a/python/paddle/utils/merge_model.py +++ b/python/paddle/utils/merge_model.py @@ -70,4 +70,4 @@ def merge_v2_model(net, param_file, output_file): for pname in param_names: params.serialize(pname, f) - print 'Generate %s success!' % (output_file) + print('Generate %s success!' % (output_file)) diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py index 27bd8157d39632913e2fa3278f3af20ddea61da7..a95e5497e23571e61e5d7652830a99efd7793083 100644 --- a/python/paddle/utils/plotcurve.py +++ b/python/paddle/utils/plotcurve.py @@ -44,6 +44,7 @@ To use this script to generate plot for AvgCost, error: python plotcurve.py -i paddle.INFO -o figure.png AvgCost error """ +import six import sys import matplotlib # the following line is added immediately after import matplotlib @@ -91,7 +92,7 @@ def plot_paddle_curve(keys, inputfile, outputfile, format='png', sys.stderr.write("No data to plot. Exiting!\n") return m = len(keys) + 1 - for i in xrange(1, m): + for i in six.moves.xrange(1, m): pyplot.plot( x[:, 0], x[:, i], diff --git a/python/paddle/utils/predefined_net.py b/python/paddle/utils/predefined_net.py index fa05f981f2b66bf55303a6f7c332c0bc9b112d29..2801f4877c079615239b92be146b3e33df16b37f 100644 --- a/python/paddle/utils/predefined_net.py +++ b/python/paddle/utils/predefined_net.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import six import os from paddle.trainer.config_parser import * from paddle.utils.preprocess_img import \ @@ -112,7 +113,7 @@ def simple_conv_net(data_conf, is_color=False): num_classes: num of classes. is_color: whether the input images are color. """ - for k, v in data_conf.iteritems(): + for k, v in six.iteritems(data_conf): globals()[k] = v data_input, label_input, num_image_channels = \ image_data_layers(image_size, num_classes, is_color, is_predict) @@ -340,7 +341,7 @@ def small_vgg(data_conf, is_predict=False): num_classes: num of classes. is_color: whether the input images are color. 
""" - for k, v in data_conf.iteritems(): + for k, v in six.iteritems(data_conf): globals()[k] = v vgg_conv_net(image_size, num_classes, num_layers=[2, 2, 3, 3], diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py index 975f1e9edea161331d37afbc6b5af46286f185bf..a322f7b769a2a32df516a4b8ea04289a7f882ff2 100644 --- a/python/paddle/utils/preprocess_img.py +++ b/python/paddle/utils/preprocess_img.py @@ -17,9 +17,9 @@ import os import random import numpy as np import PIL.Image as Image -import StringIO -import preprocess_util -from image_util import crop_img +from six.moves import cStringIO as StringIO +from . import preprocess_util +from .image_util import crop_img def resize_image(img, target_size): @@ -52,7 +52,7 @@ class DiskImage: def read_image(self): if self.img is None: - print "reading: " + self.path + print("reading: " + self.path) image = resize_image(Image.open(self.path), self.target_size) self.img = image @@ -69,7 +69,7 @@ class DiskImage: convert the image into the paddle batch format. """ self.read_image() - output = StringIO.StringIO() + output = StringIO() self.img.save(output, "jpeg") contents = output.getvalue() return contents @@ -127,7 +127,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater): image_path = items[0] label_name = items[1] if not label_name in label_set: - label_set[label_name] = len(label_set.keys()) + label_set[label_name] = len(list(label_set.keys())) img = DiskImage(path=image_path, target_size=self.target_size) label = preprocess_util.Lablel( label=label_set[label_name], name=label_name) @@ -144,7 +144,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater): return create_dataset_from_list(path) label_set = preprocess_util.get_label_set_from_dir(path) data = [] - for l_name in label_set.keys(): + for l_name in list(label_set.keys()): image_paths = preprocess_util.list_images( os.path.join(path, l_name)) for p in image_paths: diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py index 1d17a488243eb81e46bea3ead686efd021499e22..05b2067d01a2c544d7f5bd68320e79c805282286 100644 --- a/python/paddle/utils/preprocess_util.py +++ b/python/paddle/utils/preprocess_util.py @@ -14,7 +14,7 @@ import os import math -import cPickle as pickle +import six.moves.cPickle as pickle import random import collections @@ -169,7 +169,7 @@ class Dataset: random.shuffle(keyvalue_indices[k]) num_data_per_key_batch = \ - math.ceil(num_per_batch / float(len(keyvalue_indices.keys()))) + math.ceil(num_per_batch / float(len(list(keyvalue_indices.keys())))) if num_data_per_key_batch < 2: raise Exception("The number of data in a batch is too small") @@ -182,8 +182,8 @@ class Dataset: end_idx = int( min(begin_idx + num_data_per_key_batch, len(keyvalue_indices[k]))) - print "begin_idx, end_idx" - print begin_idx, end_idx + print("begin_idx, end_idx") + print(begin_idx, end_idx) for idx in range(begin_idx, end_idx): permuted_data.append(self.data[keyvalue_indices[k][idx]]) keyvalue_readpointer[k] = end_idx @@ -357,6 +357,6 @@ class DatasetCreater(object): data_batcher.create_batches_and_list( self.output_path, self.train_list_name, self.test_list_name, self.label_set_name) - self.num_classes = len(train_label_set.keys()) + self.num_classes = len(list(train_label_set.keys())) self.create_meta_file(train_data) return out_path diff --git a/python/paddle/utils/show_pb.py b/python/paddle/utils/show_pb.py index 
20614826d1d01f50a2bb54a840e2c584fb93b247..da7a71a665aea4d93d366e8508f438a9aba88e94 100644 --- a/python/paddle/utils/show_pb.py +++ b/python/paddle/utils/show_pb.py @@ -15,6 +15,8 @@ Show the content of proto buffer data file of PADDLE """ +from __future__ import print_function + import os import sys from google.protobuf.internal.decoder import _DecodeVarint @@ -39,7 +41,7 @@ def read_proto(file, message): def usage(): - print >> sys.stderr, "Usage: python show_pb.py PROTO_DATA_FILE" + print("Usage: python show_pb.py PROTO_DATA_FILE", file=sys.stderr) exit(1) @@ -50,8 +52,8 @@ if __name__ == '__main__': f = open(sys.argv[1]) header = DataFormat.DataHeader() read_proto(f, header) - print header + print(header) sample = DataFormat.DataSample() while read_proto(f, sample): - print sample + print(sample) diff --git a/python/paddle/utils/torch2paddle.py b/python/paddle/utils/torch2paddle.py index 91490111a1144ae25ed6566ff1c83db4f7954d33..398d3aa4e02cc74b7885f7e676937d7fd254bc5e 100644 --- a/python/paddle/utils/torch2paddle.py +++ b/python/paddle/utils/torch2paddle.py @@ -24,7 +24,7 @@ import sys import struct import numpy as np import torchfile -import cPickle as pickle +import six.moves.cPickle as pickle import argparse @@ -48,7 +48,7 @@ def save_net_parameters(layers, params, output_path): biases = params[i * 2 + 1] weight_file = os.path.join(output_path, '_%s.w0' % layers[i]) biases_file = os.path.join(output_path, '_%s.wbias' % layers[i]) - print "Saving for layer %s." % layers[i] + print("Saving for layer %s." % layers[i]) save_layer_parameters(weight_file, [weight]) save_layer_parameters(biases_file, biases) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc..8312900dc43fdd64cc1a205ab846b6f1deaecf5d 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' -WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' -VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' -TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' -EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06..026cf501cfb35ab3fe35d24f52d3c271565482ef 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size): for i in xrange(buffer_size): yield images[i, :], int(labels[i]) finally: - m.terminate() - l.terminate() + try: + m.terminate() + except: + pass + try: + l.terminate() + except: + pass return reader diff --git a/python/paddle/v2/dataset/wmt14.py 
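The mnist reader change above wraps each terminate() call in its own try/except so a cleanup failure on one decompression subprocess can neither mask the original error nor skip terminating the other. The same pattern in isolation, with hypothetical commands, looks like:

    import subprocess


    def read_both(cmd_a, cmd_b):
        a = subprocess.Popen(cmd_a, stdout=subprocess.PIPE)
        b = subprocess.Popen(cmd_b, stdout=subprocess.PIPE)
        try:
            return a.stdout.read(), b.stdout.read()
        finally:
            # Terminate each worker independently; a failure on the first one
            # must not prevent the second from being cleaned up.
            for proc in (a, b):
                try:
                    proc.terminate()
                except Exception:
                    pass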
b/python/paddle/v2/dataset/wmt14.py index 5104e29051e4480f3a7eb18421f1b519841b009b..b9e602f324ad9bf43416b420c6d5697050a5c802 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -15,7 +15,7 @@ WMT14 dataset. The original WMT14 dataset is too large and a small set of data for set is provided. This module will download dataset from -http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz and parse training set and test set into paddle reader creators. """ @@ -37,11 +37,10 @@ URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # this is a small set of data for test. The original data is too large and # will be add later. -URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' - 'wmt_shrinked_data/wmt14.tgz') +URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' START = "" diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 9235c41e9eb95b25a0dc53a494a203e7a4525981..08d8bd68f9b7eb703c15f7cb5ad1300969db5713 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -182,7 +182,7 @@ def resize_short(im, size): h_new = size * h / w else: w_new = size * w / h - im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC) return im @@ -324,7 +324,6 @@ def simple_transform(im, if np.random.randint(2) == 0: im = left_right_flip(im, is_color) else: - im = center_crop(im, crop_size, is_color) im = center_crop(im, crop_size, is_color=is_color) if len(im.shape) == 3: im = to_chw(im) diff --git a/python/requirements.txt b/python/requirements.txt index ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816..f8298a63612cb217ce0e711e78fffdf86b73313d 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ requests==2.9.2 -numpy>=1.12 +numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 recordio>=0.1.0 matplotlib @@ -8,4 +8,4 @@ scipy>=0.19.0 Pillow nltk>=3.2.2 graphviz -LinkChecker +six diff --git a/python/setup.py.in b/python/setup.py.in index 5506443733650631fe045be3f701a41519352e8d..786c9f2e39880b68700b8acb94b3d35a48323958 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,16 +1,13 @@ from setuptools import setup, Distribution, Extension import subprocess -import shutil import os +import re +import shutil class BinaryDistribution(Distribution): def has_ext_modules(foo): return True -MAJOR = 0 -MINOR = 14 -PATCH = 0 RC = 0 -ISTAGED = False @@ -20,16 +17,51 @@ def git_commit(): git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() except: git_commit = 'Unknown' - return git_commit + git_commit = git_commit.decode() + return str(git_commit) + +def _get_version_detail(idx): + assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ + so detail index must less than 3" + + if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'): + version_details = '@PADDLE_VERSION@'.split('.') + + if len(version_details) == 3: + return version_details[idx] + + return 0 + +def get_major(): + return int(_get_version_detail(0)) + +def get_minor(): + return 
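The image.py fix above matters because cv2.resize expects its target size as (width, height), while numpy image shapes are (height, width); passing (h_new, w_new) silently transposed the output for non-square images. An approximate re-statement of resize_short with the corrected order:

    import numpy as np
    import cv2


    def resize_short(im, size):
        h, w = im.shape[:2]
        if h > w:
            h_new, w_new = size * h // w, size
        else:
            h_new, w_new = size, size * w // h
        # dsize is (width, height), so w_new comes first.
        return cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)


    img = np.zeros((480, 640, 3), dtype=np.uint8)
    print(resize_short(img, 256).shape)  # (256, 341, 3): the short side becomes 256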
int(_get_version_detail(1)) + +def get_patch(): + return str(_get_version_detail(2)) + +def is_taged(): + try: + cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] + git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + git_tag = git_tag.decode() + except: + return False + + if str(git_tag).replace('v', '') == '@PADDLE_VERSION@': + return True + else: + return False def write_version_py(filename='paddle/version.py'): cnt = ''' # THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY # -full_version = '%(major)d.%(minor)d.%(patch)d' +full_version = '%(major)d.%(minor)d.%(patch)s' major = '%(major)d' minor = '%(minor)d' -patch = '%(patch)d' +patch = '%(patch)s' rc = '%(rc)d' istaged = %(istaged)s commit = '%(commit)s' @@ -37,13 +69,13 @@ with_mkl = '%(with_mkl)s' def show(): if istaged: - print 'full_version:', full_version - print 'major:', major - print 'minor:', minor - print 'patch:', patch - print 'rc:', rc + print('full_version:', full_version) + print('major:', major) + print('minor:', minor) + print('patch:', patch) + print('rc:', rc) else: - print 'commit:', commit + print('commit:', commit) def mkl(): return with_mkl @@ -51,13 +83,13 @@ def mkl(): commit = git_commit() with open(filename, 'w') as f: f.write(cnt % { - 'major': MAJOR, - 'minor': MINOR, - 'patch': PATCH, + 'major': get_major(), + 'minor': get_minor(), + 'patch': get_patch(), 'rc': RC, 'version': '${PADDLE_VERSION}', 'commit': commit, - 'istaged': ISTAGED, + 'istaged': is_taged(), 'with_mkl': '@WITH_MKL@'}) write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py') @@ -72,6 +104,8 @@ packages=['paddle', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', + 'paddle.fluid.contrib', + 'paddle.fluid.contrib.decoder', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details'] @@ -125,15 +159,20 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] -if '${WITH_MKLDNN}' == 'ON': - # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. - # The reason is that all thirdparty libraries in the same directory, - # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. - command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patchelf --set-rpath for libmkldnn.so.0 fails") - package_data['paddle.libs']+=['libmkldnn.so.0'] - shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. + if '${WITH_MKLDNN}' == 'ON': + # TODO(typhoonzero): use install_name_tool to patch mkl libs once + # we can support mkl on mac. + # + # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # The reason is that all thirdparty libraries in the same directory, + # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. 
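setup.py.in above now derives the version from the CMake-provided string and marks a wheel as tagged only when HEAD sits exactly on a matching git tag. A hedged sketch of that tag check, with a placeholder version and stderr captured instead of the shell-style redirection, is:

    import subprocess


    def current_tag():
        try:
            out = subprocess.Popen(
                ['git', 'describe', '--exact-match', '--tags', 'HEAD'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()[0].strip()
            # communicate() returns bytes on Python 3, so decode before comparing.
            return out.decode()
        except Exception:
            return ''


    def is_tagged(expected_version):
        return current_tag().replace('v', '') == expected_version


    print(is_tagged('0.14.0'))  # placeholder version string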
+ command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch libmkldnn.so failed, command: %s" % command) + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) # remove unused paddle/libs/__init__.py os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path @@ -142,9 +181,22 @@ package_dir['paddle.libs']=libs_path # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 -command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" -if os.system(command) != 0: - raise Exception("patchelf --set-rpath for core.so fails") +if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" + else: + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" + if os.system(command) != 0: + raise Exception("patch core.so failed, command: %s" % command) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.so. + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + if os.system(command) != 0: + raise Exception("patch _swig_paddle.so failed, command: %s" % command) setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py index 7de76c381b29a1ff8dcf2167f0e861dc261aa47b..c44690a93ac3c1f1833ee62b4e13d1ae8220fb55 100644 --- a/tools/check_ctest_hung.py +++ b/tools/check_ctest_hung.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import sys import re @@ -46,7 +48,7 @@ Diff: set(['test_parallel_executor_crf']) start_parts = escape(l).split(" ") m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) started.add(m.group(1)) - print "Diff: ", started - passed + print("Diff: ", started - passed) if __name__ == "__main__": diff --git a/tools/check_pr_approval.py b/tools/check_pr_approval.py new file mode 100644 index 0000000000000000000000000000000000000000..937b0be7562fab93157c16b942631f0a580dfc68 --- /dev/null +++ b/tools/check_pr_approval.py @@ -0,0 +1,49 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import sys +import json + + +def check_approval(count, required_reviewers): + json_buff = "" + for line in sys.stdin: + json_buff = "".join([json_buff, line]) + json_resp = json.loads(json_buff) + approves = 0 + approved_user_ids = [] + for review in json_resp: + if review["state"] == "APPROVED": + approves += 1 + approved_user_ids.append(review["user"]["id"]) + + # convert to int + required_reviewers_int = set() + for rr in required_reviewers: + required_reviewers_int.add(int(rr)) + + if len(set(approved_user_ids) & required_reviewers_int) >= count: + print("TRUE") + else: + print("FALSE") + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1].isdigit(): + check_approval(int(sys.argv[1]), sys.argv[2:]) + else: + print( + "Usage: python check_pr_approval.py [count] [required reviewer id] ..." + ) diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,7 +4,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do - if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then + if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then continue; else cpplint --filter=-readability/fn_size $file; diff --git a/tools/diff_api.py b/tools/diff_api.py index cf9f2c72cb78ddf88ff2a7bb1c0ee4b00ec0ec96..97c739ed2a5627ad9fd326f206976a4579dc26a3 100644 --- a/tools/diff_api.py +++ b/tools/diff_api.py @@ -20,9 +20,7 @@ for each_diff in result: if each_diff[0] in ['-', '?']: # delete or change API is not allowed error = True elif each_diff[0] == '+': - # only new layers is allowed. 
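The new tools/check_pr_approval.py reads the GitHub reviews payload from stdin and passes only when enough of the required reviewer ids have approved. Its core check, exercised here with a fabricated payload and made-up reviewer ids, amounts to:

    import json

    # Stand-in for the GitHub reviews API response the script reads from stdin.
    json_buff = '''[
      {"state": "APPROVED",  "user": {"id": 101}},
      {"state": "COMMENTED", "user": {"id": 102}},
      {"state": "APPROVED",  "user": {"id": 103}}
    ]'''

    reviews = json.loads(json_buff)
    approved = set(r['user']['id'] for r in reviews if r['state'] == 'APPROVED')
    required = {101, 103}
    count = 2
    print('TRUE' if len(approved & required) >= count else 'FALSE')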
- if not each_diff.startswith('+ paddle.fluid.layers.'): - error = True + error = True if each_diff[0] != ' ': print(each_diff) diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index bca0b77ad71a3f65dda15191e5f540bfc2e043d1..0d59e4c110ff8502acb4dbcda15f855f7652a946 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ bash build_scripts/install_nccl2.sh && rm -r build_scripts @@ -40,11 +40,13 @@ RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddl RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \ go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index 93591fa9ddad8a78df344e1e912a5f1c7e93dfa4..eb4b477dcb538f7ba17cfc54057a97c9669a6916 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -28,7 +28,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel" # Libraries that are allowed as part of the manylinux1 profile -MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" +MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" # Get build utilities MY_DIR=$(dirname "${BASH_SOURCE[0]}") @@ -105,7 +105,7 @@ curl-config --features rm -rf /usr/local/ssl # Install 
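The tools/diff_api.py change above tightens the public-API check so that any added signature, not only removals and edits, now fails the comparison. A small difflib illustration with invented signature lines:

    import difflib

    baseline = ['paddle.fluid.layers.fc (input, size)', 'paddle.fluid.layers.relu (x)']
    current = ['paddle.fluid.layers.fc (input, size)', 'paddle.fluid.layers.relu6 (x)']

    error = False
    for each_diff in difflib.Differ().compare(baseline, current):
        if each_diff[0] in ['-', '?', '+']:  # any deviation from the baseline
            error = True
        if each_diff[0] != ' ':
            print(each_diff)

    print('API changed: %s' % error)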
patchelf (latest with unreleased bug fixes) -curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz +curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH tar -xzf patchelf-0.9njs2.tar.gz (cd patchelf-0.9njs2 && ./configure && make && make install) diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh index 282c5c290da14bd3c04346ab01fdb48423c23f88..43a99d8287bbaa13ff75d9f25972a6335ae0754a 100644 --- a/tools/manylinux1/build_scripts/install_nccl2.sh +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -21,5 +21,5 @@ for sub_deb in $DEBS; do ar x $sub_deb && tar xf data.tar.xz done mv -f usr/include/nccl.h /usr/local/include/ -mv -f usr/lib/libnccl* /usr/local/lib/ +mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/ rm -rf $DIR diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 5e7ffd44c7b0ba2270069bc4467dc377a58b2417..e2805c4e7e6aa26a5865b64a874feef672bf9b36 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -17,6 +17,8 @@ Print all signature of a python module in alphabet order. Usage: ./print_signature "paddle.fluid" > signature.txt """ +from __future__ import print_function + import importlib import inspect import collections @@ -64,4 +66,4 @@ def visit_all_module(mod): visit_all_module(importlib.import_module(sys.argv[1])) for name in member_dict: - print name, member_dict[name] + print(name, member_dict[name]) diff --git a/tools/test_runner.py b/tools/test_runner.py index 9dc750b89058cd73355a2f7984d577252c03526d..9b9f165e7368364bbb0a78d6dcbbe4be0d6bf98b 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -12,19 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import os import sys import paddle.fluid as fluid import importlib -import cStringIO +from six.moves import cStringIO def main(): sys.path.append(os.getcwd()) some_test_failed = False for module_name in sys.argv[1:]: - buffer = cStringIO.StringIO() + buffer = cStringIO() main = fluid.Program() startup = fluid.Program() scope = fluid.core.Scope() @@ -37,8 +39,11 @@ def main(): res = unittest.TextTestRunner(stream=buffer).run(tests) if not res.wasSuccessful(): some_test_failed = True - print >> sys.stderr, module_name, 'failed\n', buffer.getvalue( - ) + print( + module_name, + 'failed\n', + buffer.getvalue(), + file=sys.stderr) if some_test_failed: exit(1) diff --git a/tools/timeline.py b/tools/timeline.py index b413bb6fe0505df8fb09fa0759fefb6509b95bc9..f850476831d84787bf5cc7c7f7c91ff9dd6a2d5b 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -14,6 +14,7 @@ import argparse import json +import six import sys import unittest @@ -124,7 +125,7 @@ class Timeline(object): return cur_pid def _allocate_pids(self): - for k, profile_pb in self._profile_dict.iteritems(): + for k, profile_pb in six.iteritems(self._profile_dict): for event in profile_pb.events: if event.type == profiler_pb2.Event.CPU: if (k, event.device_id, "CPU") not in self._devices: @@ -140,7 +141,7 @@ class Timeline(object): (k, event.device_id), pid) def _allocate_events(self): - for k, profile_pb in self._profile_dict.iteritems(): + for k, profile_pb in six.iteritems(self._profile_dict): for event in profile_pb.events: if event.type == profiler_pb2.Event.CPU: type = "CPU"
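tools/test_runner.py above swaps cStringIO for its six.moves equivalent and only prints the captured runner output to stderr when a module fails. A compact sketch of that buffering pattern, with a throwaway test case in place of the Paddle unit tests:

    from __future__ import print_function

    import sys
    import unittest

    from six.moves import cStringIO


    class _DemoTest(unittest.TestCase):
        def test_passes(self):
            self.assertEqual(1 + 1, 2)


    buffer = cStringIO()
    tests = unittest.TestLoader().loadTestsFromTestCase(_DemoTest)
    res = unittest.TextTestRunner(stream=buffer).run(tests)
    if not res.wasSuccessful():
        # Surface the buffered output only for failing modules.
        print('demo_module', 'failed\n', buffer.getvalue(), file=sys.stderr)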