Merge branch 'develop' into math_function

1ef97fa7 · Luo Tao · f67275a9 · 84aea8a8 · 1ef97fa7 · 1ef97fa7
299 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,13 +4,17 @@ cache:
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
+services:
+  - docker
 os:
  - linux
 env:
  - JOB=build_doc
  - JOB=check_style
+  - JOB=build_android
 addons:
  apt:
    packages:
@@ -41,8 +45,10 @@ before_install:
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
-    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
+    # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
+    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
+    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
  - |
    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
@@ -50,7 +56,7 @@ script:
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
    export DOCS_DIR=`pwd`
    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc   
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2   
 notifications:
  email:
    on_success: change

--- a/AUTHORS.md
+++ b/AUTHORS.md
 | Github account | name |
 |---|---|
+| abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
 | beckett1124 | Bin Qi |
 | JiayiFeng | Jia-Yi Feng |

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,6 +144,8 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/grpc)
+include(external/snappy)    # download snappy
+include(external/snappystream)
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
@@ -166,11 +168,11 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 set(EXTERNAL_LIBS
-    ${GFLAGS_LIBRARIES}
+    gflags
-    ${GLOG_LIBRARIES}
+    glog
    ${CBLAS_LIBRARIES}
-    ${PROTOBUF_LIBRARY}
+    protobuf
-    ${ZLIB_LIBRARIES}
+    zlib
    ${PYTHON_LIBRARIES}
 )

--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -106,10 +106,10 @@ def vgg16_bn_drop(input):
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
    return fc2
@@ -138,13 +138,14 @@ def main():
    avg_cost = fluid.layers.mean(x=cost)
    # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size)
    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(batch_acc)
-        inference_program = fluid.io.get_inference_program(test_target)
    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
@@ -157,27 +158,30 @@ def main():
    # test
    def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
+            outs = exe.run(inference_program,
-                    feed={"pixel": img_data,
+                           feed={"pixel": img_data,
-                          "label": y_data})
+                                 "label": y_data},
+                           fetch_list=[batch_acc, batch_size])
+            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-        return accuracy.eval(exe)
+        return test_pass_acc.eval()
    def train_loop(exe, trainer_prog):
        iters = 0
        ts = time.time()
+        train_pass_acc = fluid.average.WeightedAverage()
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
            with profiler.profiler("CPU", 'total') as prof:
                for batch_id, data in enumerate(train_reader()):
                    ts = time.time()
@@ -187,13 +191,14 @@ def main():
                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                    y_data = y_data.reshape([-1, 1])
-                    loss, acc = exe.run(
+                    loss, acc, b_size = exe.run(
                        trainer_prog,
                        feed={"pixel": img_data,
                              "label": y_data},
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                        fetch_list=[avg_cost, batch_acc, batch_size])
                    iters += 1
                    num_samples += len(data)
+                    train_pass_acc.add(value=acc, weight=b_size)
                    print(
                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
                        % (pass_id, iters, loss, acc,
@@ -201,7 +206,7 @@ def main():
                    )  # The accuracy is the accumulation of batches, but not the current batch.
            pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)
            print(
                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"

--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+include (ExternalProject)
+# NOTE: snappy is needed when linking with recordio
+SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+ExternalProject_Add(
+    extern_snappy
+    GIT_REPOSITORY "https://github.com/google/snappy"
+    GIT_TAG "1.1.7"
+    PREFIX          ${SNAPPY_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_TESTING=OFF
+                    -DSNAPPY_BUILD_TESTS:BOOL=OFF
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_COMMAND   make -j8
+    INSTALL_COMMAND make install
+)
+add_library(snappy STATIC IMPORTED GLOBAL)
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
+             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+include_directories(${SNAPPY_INCLUDE_DIR})
+add_dependencies(snappy extern_snappy)
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+include (ExternalProject)
+# NOTE: snappy is needed when linking with recordio
+SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+# Download, build and install snappystream as an external project.
+# It is configured against the snappy install tree (SNAPPY_ROOT) and depends
+# on the `snappy` target, so snappy is built first.
+# The install prefix/libdir are set to SNAPPYSTREAM_INSTALL_DIR in both
+# CMAKE_ARGS and CMAKE_CACHE_ARGS so they agree with each other and with the
+# IMPORTED_LOCATION of the `snappystream` target declared after this call.
+ExternalProject_Add(
+        extern_snappystream
+        GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+        GIT_TAG "0.2.8"
+        PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
+        UPDATE_COMMAND  ""
+        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_INSTALL_PREFIX=${SNAPPYSTREAM_INSTALL_DIR}
+                        -DCMAKE_INSTALL_LIBDIR=${SNAPPYSTREAM_INSTALL_DIR}/lib
+                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                        -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+                        ${EXTERNAL_OPTIONAL_ARGS}
+        CMAKE_CACHE_ARGS
+                        -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
+                        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
+                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+        BUILD_COMMAND   make -j8
+        INSTALL_COMMAND make install
+        DEPENDS snappy
+)
+add_library(snappystream STATIC IMPORTED GLOBAL)
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
+        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+add_dependencies(snappystream extern_snappystream)
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -28,7 +28,7 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
 ExternalProject_Add(
-    zlib
+    extern_zlib
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
    GIT_TAG         "v1.2.8"
@@ -49,9 +49,11 @@ ExternalProject_Add(
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+ADD_DEPENDENCIES(zlib extern_zlib)
 LIST(APPEND external_project_dependencies zlib)
-ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 IF(WITH_C_API)
  INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
  foreach(lib ${libs})
    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()
-  list(REMOVE_DUPLICATES libs_deps)
+  if(libs_deps)
+    list(REMOVE_DUPLICATES libs_deps)
+  endif()
  # To produce a library we need at least one source file.
  # It is created by add_custom_command below and will helps
@@ -191,10 +193,13 @@ function(cc_library TARGET_NAME)
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
-      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      else()
+        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()

--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
-if(NOT DEFINED SPHINX_THEME)
+add_subdirectory(v2)
-    set(SPHINX_THEME default)
-endif()
-if(NOT DEFINED SPHINX_THEME_DIR)
-    set(SPHINX_THEME_DIR)
-endif()
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-# HTML output director
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
-    "${BINARY_BUILD_DIR_EN}/conf.py"
-    @ONLY)
-sphinx_add_target(paddle_docs
-                  html
-                  ${BINARY_BUILD_DIR_EN}
-                  ${SPHINX_CACHE_DIR_EN}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR_EN})
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
-# HTML output directory
-set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
-    "${BINARY_BUILD_DIR_CN}/conf.py"
-    @ONLY)
-sphinx_add_target(paddle_docs_cn
-                  html
-                  ${BINARY_BUILD_DIR_CN}
-                  ${SPHINX_CACHE_DIR_CN}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR_CN})
-add_subdirectory(api)
--- a/doc/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
-安装与编译
-==========
-.. _install_steps:
-安装流程
-++++++++
-PaddlePaddle提供pip和Docker的安装方式：
-.. toctree::
-   :maxdepth: 1
-   pip_install_cn.rst
-   docker_install_cn.rst
-编译流程
-++++++++
-..  warning::
-    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
-..  toctree::
-    :maxdepth: 1
-    build_from_source_cn.rst
-常见问题解答
-++++++++++
-`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
--- a/doc/dev/contribute_to_paddle_en.md
+++ b/doc/dev/contribute_to_paddle_en.md
-../../CONTRIBUTING.md
\ No newline at end of file
--- a/doc/dev/new_op_cn.md
+++ b/doc/dev/new_op_cn.md
--- a/doc/dev/new_op_en.md
+++ b/doc/dev/new_op_en.md
--- a/doc/dev/new_op_kernel_en.md
+++ b/doc/dev/new_op_kernel_en.md
--- a/doc/dev/use_eigen_cn.md
+++ b/doc/dev/use_eigen_cn.md
--- a/doc/dev/use_eigen_en.md
+++ b/doc/dev/use_eigen_en.md
--- a/doc/howto/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/cluster/fluid_cluster_train_en.md
--- a/doc/howto/optimization/cpu_profiling_cn.md
+++ b/doc/howto/optimization/cpu_profiling_cn.md
--- a/doc/howto/optimization/cpu_profiling_en.md
+++ b/doc/howto/optimization/cpu_profiling_en.md
--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
--- a/doc/howto/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
-..  _cmd_line_index:
-命令行参数设置
-===============
-..  toctree::
-  :maxdepth: 1
-  use_case_cn.md
-  arguments_cn.md
-  detail_introduction_cn.md
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -21,10 +21,11 @@ import paddle.v2
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"]
 # -- General configuration ------------------------------------------------
@@ -120,7 +121,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
+#html_static_path = []
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -22,10 +22,11 @@ import paddle.v2
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"]
 # -- General configuration ------------------------------------------------
@@ -120,7 +121,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
+#html_static_path = []
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'

--- a/doc/templates/layout.html
+++ b/doc/templates/layout.html
@@ -2,6 +2,13 @@
 {# Import the theme's layout. #}
 {% extends "!layout.html" %}
+{# SIDE NAV, TOGGLES ON MOBILE #}		
+{% block menu %}
+<nav class="doc-menu-vertical" role="navigation">
+{% set toctree = toctree(maxdepth=-1, collapse=False,titles_only=True, includehidden=True) %}
+{{ toctree }}
+</nav>
+{% endblock %}
 {%- block extrahead %} 
 <script>

--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+sphinx_add_target(paddle_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+sphinx_add_target(paddle_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
+add_subdirectory(api)
--- a/doc/api/CMakeLists.txt
+++ b/doc/api/CMakeLists.txt
@@ -8,7 +8,7 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"
    @ONLY)

--- a/doc/api/v2/config/activation.rst
+++ b/doc/api/v2/config/activation.rst
--- a/doc/api/v2/config/attr.rst
+++ b/doc/api/v2/config/attr.rst
--- a/doc/api/v2/config/evaluators.rst
+++ b/doc/api/v2/config/evaluators.rst
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
--- a/doc/api/v2/config/optimizer.rst
+++ b/doc/api/v2/config/optimizer.rst
--- a/doc/api/v2/config/pooling.rst
+++ b/doc/api/v2/config/pooling.rst
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
--- a/doc/api/v2/data/data_reader.rst
+++ b/doc/api/v2/data/data_reader.rst
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
--- a/doc/api/v2/data/image.rst
+++ b/doc/api/v2/data/image.rst
--- a/doc/api/fluid/data_feeder.rst
+++ b/doc/api/fluid/data_feeder.rst
--- a/doc/api/fluid/evaluator.rst
+++ b/doc/api/fluid/evaluator.rst
--- a/doc/api/fluid/executor.rst
+++ b/doc/api/fluid/executor.rst
--- a/doc/api/fluid/gen_doc.py
+++ b/doc/api/fluid/gen_doc.py
--- a/doc/api/fluid/gen_doc.sh
+++ b/doc/api/fluid/gen_doc.sh
--- a/doc/api/fluid/index.rst
+++ b/doc/api/fluid/index.rst
--- a/doc/api/fluid/initializer.rst
+++ b/doc/api/fluid/initializer.rst
--- a/doc/api/fluid/io.rst
+++ b/doc/api/fluid/io.rst
--- a/doc/api/fluid/layers.rst
+++ b/doc/api/fluid/layers.rst
--- a/doc/api/fluid/nets.rst
+++ b/doc/api/fluid/nets.rst
--- a/doc/api/fluid/optimizer.rst
+++ b/doc/api/fluid/optimizer.rst
--- a/doc/api/fluid/param_attr.rst
+++ b/doc/api/fluid/param_attr.rst
--- a/doc/api/fluid/profiler.rst
+++ b/doc/api/fluid/profiler.rst
--- a/doc/api/fluid/regularizer.rst
+++ b/doc/api/fluid/regularizer.rst
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -5,7 +5,7 @@ API
    :maxdepth: 1
    overview.rst
-    v2/model_configs.rst
+    model_configs.rst
-    v2/data.rst
+    data.rst
-    v2/run_logic.rst
+    run_logic.rst
    fluid/index.rst
--- a/doc/api/v2/model_configs.rst
+++ b/doc/api/v2/model_configs.rst
--- a/doc/api/overview.rst
+++ b/doc/api/overview.rst
@@ -7,10 +7,6 @@ it proposes some high-level concepts such as `Layers <http://www.paddlepaddle.or
 A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data.
 We also provide the `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phrase,
-it has several easy to use methods
+it has several easy-to-use methods. To better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
- `paddle.train` 
+All in all, the V2 API gives a higher abstraction and makes PaddlePaddle programs require fewer lines of code.
- `paddle.test`
- `paddle.infer`
-to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
--- a/doc/api/v2/run_logic.rst
+++ b/doc/api/v2/run_logic.rst
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
--- a/doc/build_and_install/docker_install_cn.rst
+++ b/doc/build_and_install/docker_install_cn.rst
--- a/doc/build_and_install/docker_install_en.rst
+++ b/doc/build_and_install/docker_install_en.rst
--- a/doc/v2/build_and_install/index_cn.rst
+++ b/doc/v2/build_and_install/index_cn.rst
+安装与编译
+==========
+.. _install_steps:
+PaddlePaddle针对不同的用户群体提供了多种安装方式。
+专注深度学习模型开发
+--------------------
+PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
+.. toctree::
+	:maxdepth: 1
+	pip_install_cn.rst
+这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。
+关注底层框架
+------------
+PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
+.. toctree::
+	:maxdepth: 1
+	docker_install_cn.rst
+我们推荐在Docker中运行PaddlePaddle，该方式具有以下优势：
+- 无需单独安装第三方依赖
+- 方便分享运行时环境，易于问题的复现
+对于有定制化二进制文件需求的用户，我们同样提供了从源码编译安装PaddlePaddle的方法：
+.. toctree::
+    :maxdepth: 1
+    build_from_source_cn.rst
+.. warning::
+	需要提醒的是，这种安装方式会涉及到一些第三方库的下载、编译及安装，整个安装过程耗时较长。
+常见问题汇总
+------------
+如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：
+:ref:`常见问题解答 <install_faq>`
+如果问题没有得到解决，欢迎向PaddlePaddle社区反馈问题：
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
--- a/doc/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
--- a/doc/build_and_install/paddleci.png
+++ b/doc/build_and_install/paddleci.png
--- a/doc/build_and_install/pip_install_cn.rst
+++ b/doc/build_and_install/pip_install_cn.rst
--- a/doc/build_and_install/pip_install_en.rst
+++ b/doc/build_and_install/pip_install_en.rst
--- a/doc/dev/FullyConnected.jpg
+++ b/doc/dev/FullyConnected.jpg
--- a/doc/dev/contribute_to_paddle_cn.md
+++ b/doc/dev/contribute_to_paddle_cn.md
--- a/doc/v2/dev/contribute_to_paddle_en.md
+++ b/doc/v2/dev/contribute_to_paddle_en.md
+../../../CONTRIBUTING.md
\ No newline at end of file
--- a/doc/dev/index_cn.rst
+++ b/doc/dev/index_cn.rst
@@ -6,3 +6,4 @@
  contribute_to_paddle_cn.md
  write_docs_cn.rst
+  new_layer_cn.rst
--- a/doc/dev/index_en.rst
+++ b/doc/dev/index_en.rst
@@ -6,3 +6,4 @@ Development
  contribute_to_paddle_en.md
  write_docs_en.rst
+  new_layer_en.rst
--- a/doc/dev/new_layer_cn.rst
+++ b/doc/dev/new_layer_cn.rst
-================
+==================
-实现新的网络层
+如何实现新的网络层
-================
+==================
 这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。

--- a/doc/dev/new_layer_en.rst
+++ b/doc/dev/new_layer_en.rst
--- a/doc/dev/write_docs_cn.rst
+++ b/doc/dev/write_docs_cn.rst
--- a/doc/dev/write_docs_en.rst
+++ b/doc/dev/write_docs_en.rst
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
+.. _install_faq:
 ###################
 编译安装与单元测试
 ###################

--- a/doc/faq/build_and_install/index_en.rst
+++ b/doc/faq/build_and_install/index_en.rst
--- a/doc/faq/cluster/index_cn.rst
+++ b/doc/faq/cluster/index_cn.rst
--- a/doc/faq/cluster/index_en.rst
+++ b/doc/faq/cluster/index_en.rst
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
--- a/doc/faq/index_en.rst
+++ b/doc/faq/index_en.rst
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
--- a/doc/faq/local/index_en.rst
+++ b/doc/faq/local/index_en.rst
--- a/doc/faq/local/src/reduce_min_pool_size.py
+++ b/doc/faq/local/src/reduce_min_pool_size.py
--- a/doc/faq/local/src/word2vec_config.py
+++ b/doc/faq/local/src/word2vec_config.py
--- a/doc/faq/local/src/word2vec_dataprovider.py
+++ b/doc/faq/local/src/word2vec_dataprovider.py
--- a/doc/faq/model/index_cn.rst
+++ b/doc/faq/model/index_cn.rst
--- a/doc/faq/model/index_en.rst
+++ b/doc/faq/model/index_en.rst
--- a/doc/faq/parameter/index_cn.rst
+++ b/doc/faq/parameter/index_cn.rst
--- a/doc/faq/parameter/index_en.rst
+++ b/doc/faq/parameter/index_en.rst
--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
--- a/doc/getstarted/concepts/use_concepts_en.rst
+++ b/doc/getstarted/concepts/use_concepts_en.rst
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
--- a/doc/getstarted/quickstart_cn.rst
+++ b/doc/getstarted/quickstart_cn.rst
--- a/doc/getstarted/quickstart_en.rst
+++ b/doc/getstarted/quickstart_en.rst
--- a/doc/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/capi/compile_paddle_lib_cn.md
--- a/doc/howto/capi/compile_paddle_lib_en.md
+++ b/doc/howto/capi/compile_paddle_lib_en.md
--- a/doc/howto/capi/images/csr.png
+++ b/doc/howto/capi/images/csr.png
--- a/doc/howto/capi/images/sequence_data.png
+++ b/doc/howto/capi/images/sequence_data.png
--- a/doc/howto/capi/images/workflow_of_CAPI.png
+++ b/doc/howto/capi/images/workflow_of_CAPI.png
--- a/doc/howto/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
--- a/doc/howto/capi/index_en.rst
+++ b/doc/howto/capi/index_en.rst
--- a/doc/howto/capi/organization_of_the_inputs_cn.md
+++ b/doc/howto/capi/organization_of_the_inputs_cn.md
--- a/doc/howto/capi/organization_of_the_inputs_en.md
+++ b/doc/howto/capi/organization_of_the_inputs_en.md
--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
--- a/doc/howto/capi/workflow_of_capi_en.md
+++ b/doc/howto/capi/workflow_of_capi_en.md
--- a/doc/howto/cluster/cmd_argument_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
--- a/doc/howto/cluster/cmd_argument_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
--- a/doc/howto/cluster/index_cn.rst
+++ b/doc/howto/cluster/index_cn.rst
 分布式训练
 ==========
-本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+深度学习模型的效果好坏与数据量的大小往往有直接的关系：相同的模型，在增大训练数据集后一般都能取得更好的效果。但是当数据量增大到一定程度后，单台计算机已经难以承受。这时，使用多台计算机进行分布式训练就是一个很自然的解决方案。在分布式训练中，训练数据被分割为多份，参与训练的多台机器分别读取自己的数据进行训练，并协同对整体模型的参数进行更新。
+分布式训练一般有着如下图所示的架构：
 .. image:: src/ps_cn.png
   :width: 500
@@ -10,13 +12,25 @@
 - 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
 - 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
-这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+通过计算节点和参数服务器的分布式协作，可以完成神经网络的同步随机梯度下降（SGD）方法的训练。PaddlePaddle同时支持同步随机梯度下降（SGD）和异步随机梯度下降（ASGD）。
-在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+在开始集群训练之前，需要先进行集群配置、PaddlePaddle安装等准备工作，了解如何通过这些步骤来配置分布式训练所需的基本环境：
 ..  toctree::
  :maxdepth: 1
  preparations_cn.md
+集群训练有大量可配置的参数，例如使用的机器数量、通信端口等。了解如何通过设置启动参数的方式，对分布式训练的过程进行配置：
+..  toctree::
+  :maxdepth: 1
  cmd_argument_cn.md
+PaddlePaddle可以兼容各种不同的集群。每种集群各有优势，使用的具体方式也有区别：
+..  toctree::
+  :maxdepth: 1
  multi_cluster/index_cn.rst
--- a/doc/howto/cluster/index_en.rst
+++ b/doc/howto/cluster/index_en.rst
--- a/doc/howto/cluster/multi_cluster/fabric_cn.md
+++ b/doc/howto/cluster/multi_cluster/fabric_cn.md
--- a/doc/howto/cluster/multi_cluster/fabric_en.md
+++ b/doc/howto/cluster/multi_cluster/fabric_en.md
--- a/doc/howto/cluster/multi_cluster/index_cn.rst
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
--- a/doc/howto/cluster/multi_cluster/index_en.rst
+++ b/doc/howto/cluster/multi_cluster/index_en.rst
--- a/doc/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ b/doc/howto/cluster/multi_cluster/k8s_aws_cn.md
--- a/doc/howto/cluster/multi_cluster/k8s_aws_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
--- a/doc/howto/cluster/multi_cluster/k8s_cn.md
+++ b/doc/howto/cluster/multi_cluster/k8s_cn.md
--- a/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
+++ b/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
--- a/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
--- a/doc/howto/cluster/multi_cluster/k8s_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_en.md
--- a/doc/howto/cluster/multi_cluster/openmpi_cn.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_cn.md
--- a/doc/howto/cluster/multi_cluster/openmpi_en.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_en.md
--- a/doc/howto/cluster/multi_cluster/src/add_security_group.png
+++ b/doc/howto/cluster/multi_cluster/src/add_security_group.png
--- a/doc/howto/cluster/multi_cluster/src/create_efs.png
+++ b/doc/howto/cluster/multi_cluster/src/create_efs.png
--- a/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
+++ b/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
--- a/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
+++ b/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
--- a/doc/howto/cluster/multi_cluster/src/k8s_data/README.md
+++ b/doc/howto/cluster/multi_cluster/src/k8s_data/README.md
--- a/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
+++ b/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
--- a/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
+++ b/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
--- a/doc/howto/cluster/multi_cluster/src/k8s_train/README.md
+++ b/doc/howto/cluster/multi_cluster/src/k8s_train/README.md
--- a/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
+++ b/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
--- a/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
+++ b/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
--- a/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
+++ b/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
--- a/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
+++ b/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
--- a/doc/howto/cluster/multi_cluster/src/route53_create_zone.png
+++ b/doc/howto/cluster/multi_cluster/src/route53_create_zone.png
--- a/doc/howto/cluster/multi_cluster/src/worker_security_group.png
+++ b/doc/howto/cluster/multi_cluster/src/worker_security_group.png
--- a/doc/howto/cluster/preparations_cn.md
+++ b/doc/howto/cluster/preparations_cn.md
--- a/doc/howto/cluster/preparations_en.md
+++ b/doc/howto/cluster/preparations_en.md
--- a/doc/howto/cluster/src/Dockerfile
+++ b/doc/howto/cluster/src/Dockerfile
--- a/doc/howto/cluster/src/efs_mount.png
+++ b/doc/howto/cluster/src/efs_mount.png
--- a/doc/howto/cluster/src/managed_policy.png
+++ b/doc/howto/cluster/src/managed_policy.png
--- a/doc/howto/cluster/src/ps_cn.png
+++ b/doc/howto/cluster/src/ps_cn.png
--- a/doc/howto/cluster/src/ps_en.png
+++ b/doc/howto/cluster/src/ps_en.png
--- a/doc/howto/cluster/src/trainer.png
+++ b/doc/howto/cluster/src/trainer.png
--- a/doc/howto/cluster/src/trainer_cn.png
+++ b/doc/howto/cluster/src/trainer_cn.png
--- a/doc/howto/cluster/src/word2vec/api_train_v2.py
+++ b/doc/howto/cluster/src/word2vec/api_train_v2.py
--- a/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
+++ b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
--- a/doc/howto/cluster/src/word2vec/prepare.py
+++ b/doc/howto/cluster/src/word2vec/prepare.py
--- a/doc/howto/cmd_parameter/arguments_cn.md
+++ b/doc/howto/cmd_parameter/arguments_cn.md
--- a/doc/howto/cmd_parameter/arguments_en.md
+++ b/doc/howto/cmd_parameter/arguments_en.md
--- a/doc/howto/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/cmd_parameter/detail_introduction_cn.md
--- a/doc/howto/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/cmd_parameter/detail_introduction_en.md
--- a/doc/v2/howto/cmd_parameter/index_cn.rst
+++ b/doc/v2/howto/cmd_parameter/index_cn.rst
+..  _cmd_line_index:
+命令行参数设置
+===============
+深度学习算法的实现有着多样化的特点，运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数，以实现对模型训练或预测流程的控制。
+在这一部分，首先以几个实际场景为例，展示了部分命令行参数的使用:
+..  toctree::
+  :maxdepth: 1
+  use_case_cn.md
+接着对所有参数的使用场合进行概述和分类:
+..  toctree::
+  :maxdepth: 1
+  arguments_cn.md
+最后给出细节描述，详细解释这些参数的属性和意义:
+..  toctree::
+  :maxdepth: 1
+  detail_introduction_cn.md
--- a/doc/howto/cmd_parameter/index_en.rst
+++ b/doc/howto/cmd_parameter/index_en.rst
--- a/doc/howto/cmd_parameter/use_case_cn.md
+++ b/doc/howto/cmd_parameter/use_case_cn.md
--- a/doc/howto/cmd_parameter/use_case_en.md
+++ b/doc/howto/cmd_parameter/use_case_en.md
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -6,5 +6,6 @@ HOW TO
  cmd_parameter/index_en.rst
  cluster/index_en.rst
+  capi/index_en.rst
  rnn/index_en.rst
  optimization/gpu_profiling_en.rst
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -55,7 +55,7 @@ above profilers.
 :code:`paddle/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
-.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
   :language: c++
   :lines: 137-151
   :linenos:
@@ -83,7 +83,7 @@ program crashes when CPU version of PaddlePaddle invokes them.
 1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
-    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
        :lines: 137-151
        :emphasize-lines: 8-12,14
@@ -130,7 +130,7 @@ nvprof 工具
 1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
-    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
        :lines: 137-151
        :emphasize-lines: 6-7

--- a/doc/howto/optimization/gpu_profiling_en.rst
+++ b/doc/howto/optimization/gpu_profiling_en.rst
@@ -54,7 +54,7 @@ In this tutorial, we will focus on nvprof and nvvp.
 :code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
 above profilers.
-.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
   :language: c++
   :lines: 137-151
   :linenos:
@@ -80,7 +80,7 @@ As a simple example, consider the following:
 1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasized lines).
-    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
        :lines: 137-151
        :emphasize-lines: 8-12,14
@@ -127,7 +127,7 @@ To use this command line profiler **nvprof**, you can simply issue the following
 1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasized lines).
-    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
        :lines: 137-151
        :emphasize-lines: 6-7

--- a/doc/howto/optimization/nvvp1.png
+++ b/doc/howto/optimization/nvvp1.png
--- a/doc/howto/optimization/nvvp2.png
+++ b/doc/howto/optimization/nvvp2.png
--- a/doc/howto/optimization/nvvp3.png
+++ b/doc/howto/optimization/nvvp3.png
--- a/doc/howto/optimization/nvvp4.png
+++ b/doc/howto/optimization/nvvp4.png
--- a/doc/howto/optimization/pprof_1.png
+++ b/doc/howto/optimization/pprof_1.png
--- a/doc/howto/optimization/pprof_2.png
+++ b/doc/howto/optimization/pprof_2.png
--- a/doc/howto/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/rnn/hierarchical_layer_cn.rst
--- a/doc/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/howto/rnn/hierarchical_layer_en.rst
--- a/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
--- a/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
--- a/doc/howto/rnn/index_cn.rst
+++ b/doc/howto/rnn/index_cn.rst
--- a/doc/howto/rnn/index_en.rst
+++ b/doc/howto/rnn/index_en.rst
--- a/doc/howto/rnn/recurrent_group_cn.md
+++ b/doc/howto/rnn/recurrent_group_cn.md
--- a/doc/howto/rnn/recurrent_group_en.md
+++ b/doc/howto/rnn/recurrent_group_en.md
--- a/doc/howto/rnn/rnn_config_cn.rst
+++ b/doc/howto/rnn/rnn_config_cn.rst
--- a/doc/howto/rnn/rnn_config_en.rst
+++ b/doc/howto/rnn/rnn_config_en.rst
--- a/doc/howto/rnn/src/bi_lstm.jpg
+++ b/doc/howto/rnn/src/bi_lstm.jpg
--- a/doc/howto/rnn/src/encoder-decoder-attention-model.png
+++ b/doc/howto/rnn/src/encoder-decoder-attention-model.png
--- a/doc/howto/rnn/src/glossary_rnn.dot
+++ b/doc/howto/rnn/src/glossary_rnn.dot
--- a/doc/howto/rnn/src/glossary_rnn_with_memory.dot
+++ b/doc/howto/rnn/src/glossary_rnn_with_memory.dot
--- a/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
+++ b/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
--- a/doc/howto/rnn/src/simple_full_recurrent.dot
+++ b/doc/howto/rnn/src/simple_full_recurrent.dot
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
--- a/doc_theme/static/css/override.css
+++ b/doc_theme/static/css/override.css
-* {
-    font-family:"Roboto","Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
-}
-body {
-    padding-top: 80px;
-    background-image: none !important;
-    font-family: Roboto;
-}
-a, a:focus, a:hover, a:visited {
-    color: #597cf1;
-}
-.site-header {
-    position: fixed;
-    top: 0;
-    width: 100%;
-    left: 0;
-    z-index: 99;
-    background: #333;
-    height: 80px;
-    display: -webkit-flex;
-    display: -ms-flex;
-    display: -o-flex;
-    display: flex;
-    flex-flow: row nowrap;
-    justify-content: space-between;
-    box-shadow: #ccc 0 3px 3px;
-}
-.site-header > div {
-    height: 80px;
-    display: inline-block;
-    background-color: #2f323a;
-    padding: 0 30px;
-}
-.site-header .site-logo {
-    line-height: 80px;
-    width: 290px;
-    flex: 0 1 290px;
-}
-.site-header .site-logo > a {
-    display: inline-block;
-    width: 230px;
-}
-.site-header .site-nav-links {
-    flex: 0 1 100%;
-}
-.site-header .site-nav-links .site-menu {
-    height: 30px;
-    line-height: 30px; 
-    font-size: 12px;
-    background: -webkit-linear-gradient(#282b33, #2f323a);
-    background: -o-linear-gradient(#282b33, #2f323a);
-    background: -moz-linear-gradient(#282b33, #2f323a);
-    background: linear-gradient(to left, #282b33, #2f323a);
-    margin-right: -30px;
-    padding-right: 30px;
-}
-.site-header .site-nav-links .site-menu .site-page-links {
-    display: inline-block;
-    float: right;
-    margin-right: 20px;
-}
-.site-header .site-nav-links .site-menu .site-page-links> li {
-    display: inline-block;
-    float: left;
-}
-.site-header .site-nav-links .site-menu .site-page-links > li > a {
-    color: #a7adbd;
-    display: inline-block;
-    height: 30px;
-    padding: 0 20px;
-    font-size: 12px;
-}
-.site-header .site-nav-links .site-menu .site-page-links > li:hover > a,
-.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
-    background-color: #2f323a;
-    color: #bcc1d0;
-}
-.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
-    font-weight: bold;
-}
-.site-header .site-nav-links .site-menu .fork-on-github {
-    color: #597cf1;
-    line-height: 30px;
-    display: inline-block;
-    padding: 0 0 0 20px;
-    float: right;
-    position: relative;
-}
-.site-header .site-nav-links .site-menu .fork-on-github .fa {
-    margin-right: 5px;
-    font-size: 16px;
-    vertical-align: middle;
-}
-.site-header .site-nav-links .site-menu .language-switcher {
-    height: 30px;
-    display: inline-block;
-    float: right;
-    line-height: 30px;
-    padding: 0 20px;
-    position: relative;
-}
-.site-header .site-nav-links .site-menu .language-switcher > a {
-    color: #a7adbd;
-}
-.site-header .site-nav-links .site-menu .language-switcher.open > a {
-    background-color: #24272f;
-    color: #bcc1d0;
-}
-.site-header .site-nav-links .site-menu .language-switcher .fa {
-    margin-left: 5px;
-}
-.site-header .site-nav-links .site-menu .language-switcher .fa-angle-down {
-    display: inline;
-}
-.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-down {
-    display: none;
-}
-.site-header .site-nav-links .site-menu .language-switcher .fa-angle-up {
-    display: none;
-}
-.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-up {
-    display: inline;
-}
-.site-header .site-nav-links .site-menu .fork-on-github:before,
-.site-header .site-nav-links .site-menu .language-switcher:before {
-    width: 1px;
-    height: 12px;
-    top: 9px;
-    background-color: #3a3d47;
-    left: 0;
-    display: inline-block;
-    position: absolute;
-    content: "";
-}
-.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu {
-    display: none;
-    position: absolute;
-    box-shadow: #ccc 0 0 5px;
-    background-color: #fff;
-    width: 100%;
-    left: 0;
-    top: 30px;
-}
-.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li {
-    line-height: 30px;
-    padding: 0 20px;
-}
-.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li:hover {
-    background-color: #f7f8fe;
-}
-.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li + li {
-    border-top: 1px solid #dedfe5;
-}
-.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li > a {
-    color: #2f323a;
-}
-.site-header .site-nav-links .site-menu .language-switcher.open .dropdown-menu {
-    display: inline-block;
-}
-.site-header .site-nav-links .doc-module {
-    display: block;
-    height: 50px;
-    line-height: 50px;
-}
-.site-header .site-nav-links .doc-module > ul > li {
-    display: inline-block;
-    float: left;
-}
-.site-header .site-nav-links .doc-module > ul > li > a {
-    color: #c9cbd0;
-    font-size: 14px;
-    display: inline-block;
-    height: 50px;
-    line-height: 50px;
-    border-bottom: 2px solid transparent;
-    padding: 0 20px;
-}
-.site-header .site-nav-links .doc-module > ul > li:hover > a {
-    color: #fff;
-}
-.site-header .site-nav-links .doc-module > ul > li.current > a {
-    border-bottom-color: #fff;
-    color: #fff;
-}
-.site-header .site-nav-links .doc-module [role="search"]{
-    float: right;
-}
-.site-header .site-nav-links .doc-module [role="search"] input {
-    background-color: #3a3d47;
-    border-radius: 15px;
-    color: #a7adbd;
-    border: 1px solid transparent;
-    padding: 6px 15px;
-    width: 180px;
-    box-shadow: none;
-    transition: all .2s;
-    -webkit-transition: all .2s;
-    -moz-transition: all .2s;
-    -o-transition: all .2s;
-    background-repeat: no-repeat;
-    background-position: 145px center;
-    background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnG
ueYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO7K8dhFMfx1w8LBqVM5DLxF7hMTGSQpAwmJSkDizAZLSb5Ayi3clsMFgwWISGXkoSyGYRSym15fvr27duvH5/leTqd8+6c83ye1NLatohqMIgWVOEV+5jDAr7ElBO5j+IIH+hBJRqwjDHsoTQOyAvnCPpRi4tYziVmMY2dkPMc7aAG42hPKE7rAwMBNhEfYQgzOJNZ3xhGL4qigGasyk43OEdjFFCGe9nrNtT8Al5Q8AdAMd6jgFPU/QFwiN0oYD4sJzdLwBiuo4A5vG
EKqQyF1ahPcuInOsJrrKMiwWx9OMAWWpOc+BD2MImr4Ik7FIb4AzqRH6zdhU1IxT4TlKAJ5XjCMU6CkaANi2lIXsKsj1jJsIsNdKc7yfE/pSGTPwMABBFCGflm+rsAAAAASUVORK5CYII=");
-}
-.site-header .site-nav-links .doc-module [role="search"] input:focus {
-   width: 300px;
-}
-.site-header .site-nav-links .doc-module [role="search"] input:focus {
-    background-position: 265px center;
-}
-.site-header .site-nav-links .doc-module [role="search"] input:hover,
-.site-header .site-nav-links .doc-module [role="search"] input:focus {
-   color: #fff;
-   border-color: #597cf1;
-   background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGu
eYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO9K4ZhFMfxz4MFg1Im8jJ5/gIvExMZJCnFpCRlYBEGGS0m+QMoLwOyGCwyWISEvJQklM0glFLeluvR3d3d08Nvua5O53w751y/K9Uz+SyiNIbRihq8Yh+LWMaXmPIi93Ec4QN9qEYjVjGBPZTHAQXhHMMg6nARy7nEAuawE3Keox2kMYWOhOKMPjAUYNPxEUYwjzPZ9Y1R9KMkCmjButx0g3M0RQEVuJe7bkPNL+AFRX8AlOI9CjhF/R8Ah9iNApbCcvJzBEzgOgpYxBt
mkcpSWIuGJCd+ojO8xgaqEsw2gANsoy3JiQ9hDzO4Cp64Q3GIP6ALhcHa3diCVOwzQRmaUYknHOMkGAnasZKBFCTM+oi1LLvYRG+mkzz/UwYy8zMAmkpBg3fGpFUAAAAASUVORK5CYII=");
-}
-.doc-menu-vertical {
-    display: inline-block;
-    float: left;
-    width: 240px;
-    height: 100%;
-    background-color: #ecedee;
-    position: absolute;
-    left: 0;
-    top: 0;
-    overflow: hidden;
-    padding: 0;
-    border-right: 1px solid #dddfe3;
-}
-.doc-menu-vertical > ul {
-    display: none;
-}
-.doc-menu-vertical > ul.current{
-    display: block;
-}
-.doc-menu-vertical > ul.current > li.toctree-l1 {
-    display: none;
-}
-.doc-menu-vertical > ul.current > li.toctree-l1.current {
-    display: block;
-}
-.doc-menu-vertical > ul.current > li.toctree-l1.current > a {
-    display: none;
-}
-.doc-menu-vertical .toctree-l2  a {
-    width: 100%;
-    overflow: hidden;
-    text-overflow: ellipsis;
-    white-space: nowrap;
-    padding-right: 30px;
-}
-.doc-menu-vertical .toctree-l2 > a {
-    font-size: 14px;
-    color: #2f323a;
-    padding-left: 30px;
-    line-height: 50px;
-    display: block;
-    font-weight: bold;
-    border-bottom: 1px solid #dddfe3;
-}
-.doc-menu-vertical .toctree-l2.has-child > a:after {
-    font-family: "FontAwesome";
-    display: inline-block;
-    font-style: normal;
-    font-weight: normal;
-    text-decoration: inherit;
-    content: "";
-    float: right;
-    line-height: 50px;
-    color: #a7adbd;
-    position: absolute;
-    right: 15px;
-}
-.doc-menu-vertical .toctree-l2.has-child.current > a:after {
-    content: "";
-}
-.doc-menu-vertical .toctree-l2 > a + ul{
-    background-color: #e4e6e9;
-    height: 0;
-    overflow: hidden;
-}
-.doc-menu-vertical .toctree-l2.current > a + ul {
-    border-bottom: 1px solid #dddfe3;
-    height: auto;
-}
-.doc-menu-vertical .toctree-l2 li.active > a {
-    background-color: #597cf1;
-    color: #fff;
-}
-.doc-menu-vertical .toctree-l3 > a {
-    font-size: 12px;
-    color: #2f323a;
-    padding-left: 30px;
-    line-height: 40px;
-    display: block;
-}
-.doc-menu-vertical .toctree-l4 > a {
-    font-size: 12px;
-    color: #64697b;
-    padding-left: 50px;
-    line-height: 30px;
-    display: block;
-}
-.doc-menu-vertical .toctree-l5 > a {
-    font-size: 14px;
-    color: #ccc;
-    padding-left: 40px;
-    display: block;
-}
-.local-toc {
-    position: absolute;
-    height: 100%;
-    background-color: #f6f7f8;
-    top: 0;
-    left: 240px;
-    padding: 0;
-    z-index: 9;
-}
-.local-toc:after {
-    content: "";
-    position: absolute;
-    height: 100%;
-    width: 1px;
-    display: inline-block;
-    right: 0;
-    background-color: #dddfe3;
-    top: 0;
-    z-index: -1;
-}
-.local-toc:hover a {
-    width: auto;
-}
-.local-toc > ul > li a {
-    position: relative;
-    font-size: 12px;
-    overflow: hidden;
-    display: none;
-}
-.local-toc > ul > li > ul > li a {
-    display: block;
-    border-top: 1px solid transparent;
-    border-bottom: 1px solid transparent;
-    padding-right: 20px;
-    width: 50px;
-}
-.local-toc > ul > li > ul > li > ul > li > ul a {
-    display: none;
-}
-.local-toc > ul > li > ul li > a:after {
-    content: "";
-    display: inline-block;
-    width: 1px;
-    height: 100%;
-    background-color: transparent;
-    position: absolute;
-    right: 0;
-    top: 0;
-}
-.local-toc > ul > li > ul li a:hover{
-    background-color: #e6eaf7 !important;
-}
-.local-toc > ul > li > ul li a:hover:after {
-    background-color: #e6eaf7 !important;
-}
-.local-toc > ul > li > ul li.active > a {
-    color: #ff9711;
-    background-color: #fff;
-    border-top: 1px solid #dddfe3;
-    border-bottom: 1px solid #dddfe3;
-}
-.local-toc > ul > li > ul li.active > a:before {
-    background-color: #ff9711;
-    width: 10px;
-    height: 10px;
-    margin: 15px 20px;
-    border-radius: 5px;
-}
-.local-toc > ul > li > ul li.active > a:after {
-    background-color: #fff;
-}
-.local-toc > ul > li > ul > li {
-    position: relative;
-    line-height: 40px;
-    white-space: nowrap;
-}
-.local-toc > ul > li > ul > li > a {
-    color: #64697b;
-}
-.local-toc > ul > li > ul > li > a + ul {
-    display: none;
-}
-.local-toc > ul > li > ul > li > a:before {
-    display: inline-block;
-    content: "";
-    width: 6px;
-    height: 6px;
-    background-color: #ccc;
-    border-radius: 3px;
-    margin: 17px 22px;
-    float: left;
-}
-.local-toc > ul > li > ul > li > ul > li > a {
-    color: #a7adbd;
-}
-.local-toc > ul > li > ul > li > ul > li > a:before {
-    display: inline-block;
-    content: "";
-    width: 6px;
-    height: 6px;
-    background-color: #ccc;
-    border-radius: 3px;
-    margin: 17px 22px;
-    float: left;
-}
-.main-content-wrap {
-    position: absolute;
-    width: 100%;
-    top: 80px;
-    bottom: 0;
-    overflow: auto;
-    background-color: #f6f7f8;
-}
-.doc-content-wrap {
-    margin-left: 290px;
-    height: 100%;
-    position: relative;
-    padding-top: 60px;
-    background-color: #fff;
-}
-.doc-content-wrap > div[role='navigation'] {
-    position: absolute;
-    top: 0;
-    width: 100%;
-    left: 0;
-    padding: 0 30px;
-    height: 60px;
-}
-.wy-breadcrumbs {
-    line-height: 50px;
-    height: 60px;
-    background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAUCAYAAABMDlehAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAA4ZpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNS1jMDIxIDc5LjE1NTc3MiwgMjAxNC8wMS8xMy0xOTo0NDowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDpjMjhmMGQ3ZC0wODU3LTQ0ZTctOGRhZi00NGU3OTc1ZmM2MzkiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6NzRBN0NEODRBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6NzRBN0NEODNBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTQgKE1hY2ludG9zaCkiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDozNWQwMzI1ZC01ZDAyLTQ1YTYtODUxOS1lNWUzNjU5NGFhMzAiIHN0UmVmOmRvY3VtZW50SUQ9ImFkb2JlOmRvY2lkOnBob3Rvc2hvcDozZGVmZmY0OS1mNjA4LTExNzktYTRlZC1kZjJiNGY3N2YwNzMiLz4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+IDw/eHBhY2tldCBlbmQ9InIiPz7FGmP1AAAAKUlEQVR42mK4/+DpfwY9Q0tBJgYGhv8g4h8uFoKLEGOAc9FYSARAgAEAUgMQYBNmQ7sAAAAASUVORK5CYII=");
-    background-repeat: repeat no-repeat;
-    background-position: center 50px;
-}
-.wy-breadcrumbs > li {
-    color: #ccc;
-}
-.wy-breadcrumbs > li a {
-    color: #ff9711;
-    padding: 0;
-}
-.wy-breadcrumbs > li:first-child a {
-    color: #597cf1;
-}
-.wy-nav-content{
-    max-width: none;
-    overflow: auto;
-    position: relative;
-    padding: 30px;
-    background-color: #fff;
-}
-.wy-nav-content h1 {
-    font-size: 24px;
-    color: #2f323a;
-    margin-bottom: 30px;
-}
-.wy-nav-content h2 {
-    font-size: 20px;
-    color: #2f323a;
-    margin-bottom: 30px;
-}
-.wy-nav-content h3 {
-    font-size: 18px;
-    color: #2f323a;
-    margin-bottom: 30px;
-}
-.wy-nav-content h4 {
-    font-size: 16px;
-    color: #2f323a;
-    margin-bottom: 30px;
-}
-.wy-nav-content p + h1,
-.wy-nav-content p + h2,
-.wy-nav-content p + h3,
-.wy-nav-content p + h4 {
-    margin-top: 20px;
-}
-.wy-nav-content p{
-    color: #2f323a;
-    margin-bottom: 20px;
-    font-size: 14px;
-}
-#search-results h2 {
-    font-size: 24px;
-    margin: 20px 0 10px 0;
-}
-#search-results p {
-    color: #a7adbd;
-}
-#search-results ul.search > li {
-    border-bottom: none;
-}
-#search-results ul.search > li > a {
-    color: #597cf1;
-}
-.rst-content .highlighted{
-    background-color: transparent;
-    color: #ff9711;
-    padding: 0;
-}
--- a/doc_theme/static/images/PP_w.png
+++ b/doc_theme/static/images/PP_w.png
--- a/doc_theme/static/js/paddle_doc_init.js
+++ b/doc_theme/static/js/paddle_doc_init.js
-$(document).ready(function(){
-    $('.local-toc').on('click' ,'a.reference.internal', function (){
-        $('.local-toc li.active').removeClass('active');
-        $(this).parent('li').addClass('active');
-    });
-    if ($('.local-toc a:visible').length) {
-        $('.local-toc > ul').addClass('nav nav-stacked');
-        $('#doc-content').scrollspy({
-            target: '.local-toc'
-        });
-		$('.local-toc').perfectScrollbar();
-    } else {
-		$('.doc-content-wrap').css('margin-left', '-=50px');
-        $('.local-toc').remove();
-    }
-    if (!$('.doc-menu-vertical > ul > li.current > ul').length) {
-        $('.doc-content-wrap').css('margin-left', '-=240px');
-        $('.doc-menu-vertical').remove();
-        $('.local-toc').css('left', '0');
-    }
-	$('.doc-menu-vertical .toctree-l2').each(function (i, e){
-        $(e).toggleClass('has-child', !!$(e).find('ul').length);
-    });
-    $('.doc-menu-vertical').find('li.current').last().addClass('active');
-    $('.doc-menu-vertical').perfectScrollbar();
-});
--- a/doc_theme/templates/breadcrumbs.html
+++ b/doc_theme/templates/breadcrumbs.html
-{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
-{% if page_source_suffix %} 
-{% set suffix = page_source_suffix %}
-{% else %}
-{% set suffix = source_suffix %}
-{% endif %}
-{% if meta is defined and 'github_url' in meta %}
-{% set display_github = True %}
-{% endif %}
-{% if meta is defined and 'bitbucket_url' in meta %}
-{% set display_bitbucket = True %}
-{% endif %}
-<div role="navigation" aria-label="breadcrumbs navigation">
-  <ul class="wy-breadcrumbs">
-      {% for doc in parents %}
-        <li><a href="{{ doc.link|e }}">{{ doc.title }}</a> > </li>
-      {% endfor %}
-    <li>{{ title }}</li>
-  </ul>
-</div>
--- a/doc_theme/templates/layout.html
+++ b/doc_theme/templates/layout.html
-{# TEMPLATE VAR SETTINGS #}
-{%- set url_root = pathto('', 1) %}
-{%- if url_root == '#' %}{% set url_root = '' %}{% endif %}
-{%- if not embedded and docstitle %}
-  {%- set titlesuffix = " &mdash; "|safe + docstitle|e %}
-{%- else %}
-  {%- set titlesuffix = "" %}
-{%- endif %}
-<!DOCTYPE html>
-<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
-<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
-<head>
-  <meta charset="utf-8">
-  {{ metatags }}
-  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-  {% block htmltitle %}
-  <title>{{ title|striptags|e }}{{ titlesuffix }}</title>
-  {% endblock %}
-  {# FAVICON #}
-  {% if favicon %}
-    <link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/>
-  {% endif %}
-  {# CSS #}
-  {# OPENSEARCH #}
-  {% if not embedded %}
-    {% if use_opensearch %}
-      <link rel="search" type="application/opensearchdescription+xml" title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}" href="{{ pathto('_static/opensearch.xml', 1) }}"/>
-    {% endif %}
-  {% endif %}
-  {# RTD hosts this file, so just load on non RTD builds #}
-  {% if not READTHEDOCS %}
-    <link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" />
-  {% endif %}
-  {% for cssfile in css_files %}
-    <link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
-  {% endfor %}
-  {% for cssfile in extra_css_files %}
-    <link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
-  {% endfor %}
-  {%- block linktags %}
-    {%- if hasdoc('about') %}
-        <link rel="author" title="{{ _('About these documents') }}"
-              href="{{ pathto('about') }}"/>
-    {%- endif %}
-    {%- if hasdoc('genindex') %}
-        <link rel="index" title="{{ _('Index') }}"
-              href="{{ pathto('genindex') }}"/>
-    {%- endif %}
-    {%- if hasdoc('search') %}
-        <link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}"/>
-    {%- endif %}
-    {%- if hasdoc('copyright') %}
-        <link rel="copyright" title="{{ _('Copyright') }}" href="{{ pathto('copyright') }}"/>
-    {%- endif %}
-    <link rel="top" title="{{ docstitle|e }}" href="{{ pathto('index') }}"/>
-    {%- if parents %}
-        <link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}"/>
-    {%- endif %}
-    {%- if next %}
-        <link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}"/>
-    {%- endif %}
-    {%- if prev %}
-        <link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}"/>
-    {%- endif %}
-  {%- endblock %}
-  {%- block extrahead %} 
-  <link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
-  <link rel="stylesheet" href="{{pathto('_static/css/override.css', 1)}}" type="text/css" />
-  <script>
-  var _hmt = _hmt || [];
-  (function() {
-    var hm = document.createElement("script");
-    hm.src = "//hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
-    var s = document.getElementsByTagName("script")[0]; 
-    s.parentNode.insertBefore(hm, s);
-  })();
-  </script>
-  {% endblock %}
-  {# Keep modernizr in head - http://modernizr.com/docs/#installing #}
-  <script src="{{ pathto('_static/js/modernizr.min.js', 1) }}"></script>
-</head>
-<body class="wy-body-for-nav" role="document">
-  {% block extrabody %}
-  <header class="site-header">
-    <div class="site-logo">
-      <a href="/"><img src="{{pathto('_static/images/PP_w.png', 1)}}"></a>
-    </div>
-    <div class="site-nav-links">
-      <div class="site-menu">
-        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
-        <div class="language-switcher dropdown">
-          <a type="button" data-toggle="dropdown">
-            <span>English</span>
-            <i class="fa fa-angle-up"></i>
-            <i class="fa fa-angle-down"></i>
-          </a>
-          <ul class="dropdown-menu">
-            <li><a href="/doc_cn">中文</a></li>
-            <li><a href="/doc">English</a></li>
-          </ul>
-        </div>
-        <ul class="site-page-links">
-          <li><a href="/">Home</a></li>
-        </ul>
-      </div>
-      <div class="doc-module">
-        {%set modules = toctree(maxdepth=0, collapse=False, titles_only=True)%}
-        {{modules}}
-        {% include "searchbox.html" %}        
-      </div>
-    </div>
-  </header>
-  {% endblock %}
-  <div class="main-content-wrap">
-    {# SIDE NAV, TOGGLES ON MOBILE #}
-    <nav class="doc-menu-vertical" role="navigation">
-        {% block menu %}
-          {% set toctree = toctree(maxdepth=-1, collapse=False,titles_only=True, includehidden=True) %}
-          {{ toctree }}
-        {% endblock %}
-    </nav>
-    {% if False %}
-    <nav class="local-toc">{{ toc }}</nav>
-    {% endif %}
-    <section class="doc-content-wrap">
-      {% include "breadcrumbs.html" %}
-      {# PAGE CONTENT #}
-      <div class="wy-nav-content" id="doc-content">
-        <div class="rst-content">
-          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
-           <div itemprop="articleBody">
-            {% block body %}{% endblock %}
-           </div>
-          </div>
-          {% include "footer.html" %}
-        </div>
-      </div>
-    </section>
-  </div>
-  {% include "versions.html" %}
-  {% if not embedded %}
-    <script type="text/javascript">
-        var DOCUMENTATION_OPTIONS = {
-            URL_ROOT:'{{ url_root }}',
-            VERSION:'{{ release|e }}',
-            COLLAPSE_INDEX:false,
-            FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}',
-            HAS_SOURCE:  {{ has_source|lower }},
-            SOURCELINK_SUFFIX: ".txt",
-        };
-    </script>
-    {%- for scriptfile in script_files %}
-      <script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script>
-    {%- endfor %}
-  {% endif %}
-  {# RTD hosts this file, so just load on non RTD builds #}
-  {% if not READTHEDOCS %}
-    <script type="text/javascript" src="{{ pathto('_static/js/theme.js', 1) }}"></script>
-  {% endif %}
-  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
-  <script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
-  <script src="{{ pathto('_static/js/paddle_doc_init.js', 1) }}"></script>
-  {%- block footer %} {% endblock %}
-</body>
-</html>
--- a/doc_theme/templates/search.html
+++ b/doc_theme/templates/search.html
-{#
-    basic/search.html
-    ~~~~~~~~~~~~~~~~~
-    Template for the search page.
-    :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
-    :license: BSD, see LICENSE for details.
-#}
-{%- extends "layout.html" %}
-{% set title = _('Search') %}
-{% set script_files = script_files + ['_static/searchtools.js'] %}
-{% block footer %}
-  <script type="text/javascript">
-    jQuery(function() { Search.loadIndex("{{ pathto('searchindex.js', 1) }}"); });
-    jQuery('.doc-content-wrap > div[role="navigation"]').remove();
-    jQuery('.doc-content-wrap').css('padding-top', 0);
-  </script>
-  {# this is used when loading the search index using $.ajax fails,
-     such as on Chrome for documents on localhost #}
-  <script type="text/javascript" id="searchindexloader"></script>
-  {{ super() }}
-{% endblock %}
-{% block body %}
-  <noscript>
-  <div id="fallback" class="admonition warning">
-    <p class="last">
-      {% trans %}Please activate JavaScript to enable the search
-      functionality.{% endtrans %}
-    </p>
-  </div>
-  </noscript>
-  {% if search_performed %}
-    <h2>{{ _('Search Results') }}</h2>
-    {% if not search_results %}
-      <p>{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}</p>
-    {% endif %}
-  {% endif %}
-  <div id="search-results">
-  {% if search_results %}
-    <ul>
-    {% for href, caption, context in search_results %}
-      <li>
-        <a href="{{ pathto(item.href) }}">{{ caption }}</a>
-        <p class="context">{{ context|e }}</p>
-      </li>
-    {% endfor %}
-    </ul>
-  {% endif %}
-  </div>
-{% endblock %}
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -5,3 +5,4 @@ add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(inference)
 add_subdirectory(string)
+add_subdirectory(recordio)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-if (WITH_GPU)
+if(WITH_GPU)
  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
 else()
  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
-endif ()
+endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-if (WITH_GPU)
+if(WITH_GPU)
  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
 else()
  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
@@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
        DEPS operator op_registry init math_function)
-cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+if(WITH_GPU)
-cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+else()
+  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+  cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+endif()
 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
 cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)

--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -28,24 +28,19 @@ class Channel {
  virtual bool Send(T*) = 0;
  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
+  virtual void Lock() = 0;
+  virtual void Unlock() = 0;
  virtual void Close() = 0;
  virtual ~Channel() {}
 };
 // Forward declaration of channel implementations.
-namespace details {
 template <typename T>
-class Buffered;
+class ChannelImpl;
-template <typename T>
-class UnBuffered;
-}  // namespace details
 template <typename T>
 Channel<T>* MakeChannel(size_t buffer_size) {
-  if (buffer_size > 0) {
+  return new ChannelImpl<T>(buffer_size);
-    return new details::Buffered<T>(buffer_size);
-  }
-  return new details::UnBuffered<T>();
 }
 template <typename T>
@@ -89,6 +84,19 @@ class ChannelHolder {
    if (IsInitialized()) holder_->Close();
  }
+  // Returns the capacity reported by the held channel.
+  // NOTE(review): when no channel is held, the -1 below is converted to the
+  // unsigned return type, so callers actually observe the largest size_t
+  // value rather than a negative number.
+  size_t Cap() {
+    if (IsInitialized()) return holder_->Cap();
+    return -1;
+  }
+  // Forwards Lock() to the held channel; a no-op when no channel is held.
+  void Lock() {
+    if (IsInitialized()) holder_->Lock();
+  }
+  // Forwards Unlock() to the held channel; a no-op when no channel is held.
+  void Unlock() {
+    if (IsInitialized()) holder_->Unlock();
+  }
  inline bool IsInitialized() const { return holder_ != nullptr; }
  inline const std::type_index Type() {
@@ -106,6 +114,9 @@ class ChannelHolder {
    virtual const std::type_index Type() const = 0;
    virtual void* Ptr() const = 0;
    virtual void Close() = 0;
+    virtual void Lock() = 0;
+    virtual void Unlock() = 0;
+    virtual size_t Cap() = 0;
  };
  template <typename T>
@@ -115,11 +126,28 @@ class ChannelHolder {
    }
    virtual const std::type_index Type() const { return type_; }
    virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
    virtual void Close() {
      if (channel_) channel_->Close();
    }
+    // Returns the wrapped channel's capacity.
+    // NOTE(review): the -1 fallback is converted to size_t, i.e. the largest
+    // representable size_t value is returned when channel_ is null.
+    virtual size_t Cap() {
+      if (channel_)
+        return channel_->Cap();
+      else
+        return -1;
+    }
+    // Calls Lock() on the wrapped channel if there is one.
+    virtual void Lock() {
+      if (channel_) channel_->Lock();
+    }
+    // Calls Unlock() on the wrapped channel if there is one.
+    virtual void Unlock() {
+      if (channel_) channel_->Unlock();
+    }
    std::unique_ptr<Channel<T>> channel_;
    const std::type_index type_;
  };
@@ -131,5 +159,4 @@ class ChannelHolder {
 }  // namespace framework
 }  // namespace paddle
-#include "paddle/fluid/framework/details/buffered_channel.h"
+#include "paddle/fluid/framework/channel_impl.h"
-#include "paddle/fluid/framework/details/unbuffered_channel.h"
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stddef.h>  // for size_t
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+// Channel<T> implementation with a fixed capacity given at construction.
+// A capacity of 0 behaves as an unbuffered channel: nothing is ever stored
+// in buf_, so Send and Receive hand values to each other directly.
+// Send/Receive calls that cannot complete immediately park themselves in
+// sendq/recvq and sleep until a counterpart call or Close() wakes them.
+template <typename T>
+class ChannelImpl : public paddle::framework::Channel<T> {
+  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
+  friend void paddle::framework::CloseChannel<T>(Channel<T> *);
+ public:
+  virtual bool Send(T *);
+  virtual bool Receive(T *);
+  // Capacity passed to the constructor.
+  virtual size_t Cap() { return cap_; }
+  // Lock()/Unlock() expose mu_ directly to callers.
+  virtual void Lock();
+  virtual void Unlock();
+  virtual void Close();
+  ChannelImpl(size_t);
+  virtual ~ChannelImpl();
+ private:
+  // One parked Send or Receive call waiting for a counterpart.
+  struct QueueMessage {
+    // Caller-owned item: the value to send, or the slot to receive into.
+    T *data;
+    // Signalled by Notify() once the operation has been completed.
+    std::condition_variable_any cond;
+    // Set by Close() when the waiter is released because of closing.
+    bool chan_closed = false;
+    // Wait() predicate; guards against spurious wake-ups.
+    bool completed = false;
+    QueueMessage(T *item) : data(item) {}
+    // Blocks until Notify() has been called; `lock` is released while asleep.
+    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
+      cond.wait(lock, [this]() { return completed; });
+    }
+    // Marks the operation as done and wakes the waiting thread.
+    void Notify() {
+      completed = true;
+      cond.notify_all();
+    }
+  };
+  // Bookkeeping for every exit of Send(): drops the in-flight sender count,
+  // pokes the destructor, and passes `value` through as the result.
+  bool send_return(bool value) {
+    send_ctr--;
+    destructor_cond_.notify_all();
+    return value;
+  }
+  // Same as send_return(), for exits of Receive().
+  bool recv_return(bool value) {
+    recv_ctr--;
+    destructor_cond_.notify_all();
+    return value;
+  }
+  // Maximum number of items buf_ may hold.
+  size_t cap_;
+  // Taken by Send/Receive/Close/destructor before touching the state below.
+  std::recursive_mutex mu_;
+  // True once Close() has run.
+  bool closed_;
+  // Buffered items, oldest at the front.
+  std::deque<T> buf_;
+  // Parked receivers and senders, oldest at the front.
+  std::deque<std::shared_ptr<QueueMessage>> recvq;
+  std::deque<std::shared_ptr<QueueMessage>> sendq;
+  // Number of Send()/Receive() calls that have entered but not yet returned.
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
+  // Signalled by send_return()/recv_return(); the destructor waits on it.
+  std::condition_variable_any destructor_cond_;
+};
+// Creates an open channel that can buffer up to `capacity` items.
+template <typename T>
+ChannelImpl<T>::ChannelImpl(size_t capacity)
+    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
+  // NOTE(review): capacity is a size_t, so a ">= 0" comparison looks like it
+  // can never fail - confirm against the PADDLE_ENFORCE_GE definition.
+  PADDLE_ENFORCE_GE(capacity, 0);
+}
+// Sends the value pointed to by `item` (it is moved from on success).
+// Returns true once the value has been handed to a receiver or stored in
+// the buffer; returns false if the channel is, or becomes, closed before
+// that happens. Blocks while there is neither a waiting receiver nor free
+// buffer space.
+template <typename T>
+bool ChannelImpl<T>::Send(T *item) {
+  send_ctr++;
+  // mu_ is deliberately held until the function returns: send_return() has to
+  // decrement send_ctr and notify destructor_cond_ under the same mutex the
+  // destructor waits with. Releasing it earlier lets the notification slip in
+  // between the destructor's predicate check and its wait (lost wake-up), or
+  // arrive after the destructor has already torn the channel down.
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+  // If channel is closed, do nothing
+  if (closed_) {
+    // TODO(abhinavarora) Should panic on closed channel
+    return send_return(false);
+  }
+  // If there is a receiver, directly pass the value we want
+  // to send to the receiver, bypassing the channel buffer if any
+  if (!recvq.empty()) {
+    std::shared_ptr<QueueMessage> m = recvq.front();
+    recvq.pop_front();
+    // Do the data transfer
+    *(m->data) = std::move(*item);
+    // Wake up the blocked receiver
+    m->Notify();
+    return send_return(true);
+  }
+  // Unbuffered channel will always bypass this
+  // If buffered channel has space in buffer,
+  // write the element to the buffer.
+  if (buf_.size() < cap_) {
+    // Move into buffer
+    buf_.push_back(std::move(*item));
+    return send_return(true);
+  }
+  // Block on channel, because some receiver will complete
+  // the operation for us
+  auto m = std::make_shared<QueueMessage>(item);
+  sendq.push_back(m);
+  // Wait() releases mu_ while asleep and re-acquires it before returning
+  m->Wait(lock);
+  // TODO(abhinavarora) Should panic on closed channel
+  return send_return(!m->chan_closed);
+}
+// Receives one value into `*item`.
+// Returns true when a value was obtained (from the buffer or from a sender);
+// returns false if the channel is closed and nothing is left to read, or if
+// it gets closed while this call is blocked. Blocks while the channel is
+// open and there is neither buffered data nor a waiting sender.
+template <typename T>
+bool ChannelImpl<T>::Receive(T *item) {
+  recv_ctr++;
+  // mu_ is deliberately held until the function returns: recv_return() has to
+  // decrement recv_ctr and notify destructor_cond_ under the same mutex the
+  // destructor waits with, otherwise that wake-up can be lost.
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+  // If channel is closed and buffer is empty or
+  // channel is unbuffered
+  if (closed_ && buf_.empty()) {
+    return recv_return(false);
+  }
+  // If there is a blocked sender, complete its operation for it
+  if (!sendq.empty()) {
+    std::shared_ptr<QueueMessage> m = sendq.front();
+    sendq.pop_front();
+    if (buf_.empty()) {
+      // Unbuffered channel: take the value straight from the sender
+      *item = std::move(*(m->data));
+    } else {
+      // Buffered channel that is full: everything in the buffer was sent
+      // before the blocked sender's value, so hand out the oldest buffered
+      // item and append the sender's value at the tail to keep FIFO order.
+      *item = std::move(buf_.front());
+      buf_.pop_front();
+      buf_.push_back(std::move(*(m->data)));
+    }
+    // Wake up the blocked sender
+    m->Notify();
+    return recv_return(true);
+  }
+  // If this is a buffered channel and there are items in buffer
+  if (buf_.size() > 0) {
+    // Directly read from buffer
+    *item = std::move(buf_.front());
+    buf_.pop_front();
+    return recv_return(true);
+  }
+  // No sender available, block on this channel
+  // Some sender will complete the operation for us
+  auto m = std::make_shared<QueueMessage>(item);
+  recvq.push_back(m);
+  // Wait() releases mu_ while asleep and re-acquires it before returning
+  m->Wait(lock);
+  return recv_return(!m->chan_closed);
+}
+// Acquires the channel's recursive mutex on behalf of the caller; must be
+// paired with Unlock().
+template <typename T>
+void ChannelImpl<T>::Lock() {
+  mu_.lock();
+}
+// Releases one level of ownership of the channel's recursive mutex.
+template <typename T>
+void ChannelImpl<T>::Unlock() {
+  mu_.unlock();
+}
+// Marks the channel as closed and releases every parked receiver and sender,
+// flagging each of them so its Send/Receive call reports failure.
+// Closing an already closed channel is a no-op.
+template <typename T>
+void ChannelImpl<T>::Close() {
+  std::unique_lock<std::recursive_mutex> guard{mu_};
+  // TODO(abhinavarora): closing an already closed channel should panic
+  if (closed_) return;
+  closed_ = true;
+  // Pops every waiter off `waiters`, oldest first, and wakes it as "closed"
+  auto release_all = [](std::deque<std::shared_ptr<QueueMessage>> *waiters) {
+    while (!waiters->empty()) {
+      std::shared_ptr<QueueMessage> waiter = waiters->front();
+      waiters->pop_front();
+      waiter->chan_closed = true;
+      waiter->Notify();
+    }
+  };
+  // Readers first, then writers
+  release_all(&recvq);
+  release_all(&sendq);
+}
+// Closes the channel, then blocks until every Send()/Receive() call that is
+// still in flight has returned (both in-flight counters are zero).
+template <typename T>
+ChannelImpl<T>::~ChannelImpl() {
+  Close();
+  // The destructor must wait for all readers and writers to complete their task
+  // The channel has been closed, so we will not accept new readers and writers
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+  // send_return()/recv_return() notify this condition after each decrement
+  destructor_cond_.wait(lock,
+                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -23,8 +23,19 @@ using paddle::framework::Channel;
 using paddle::framework::ChannelHolder;
 using paddle::framework::MakeChannel;
 using paddle::framework::CloseChannel;
-using paddle::framework::details::Buffered;
-using paddle::framework::details::UnBuffered;
+// Cap() must report the size passed to MakeChannel, both for a buffered
+// channel (10) and for an unbuffered one (0).
+TEST(Channel, ChannelCapacityTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  CloseChannel(ch);
+  delete ch;
+  // Unbuffered channel
+  ch = MakeChannel<size_t>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  CloseChannel(ch);
+  delete ch;
+}
 void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
  unsigned sum_send = 0;
@@ -35,38 +46,17 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
    }
  });
  for (int i = 0; i < 5; i++) {
-    int recv;
+    int recv = 999;
    EXPECT_EQ(ch->Receive(&recv), true);
    EXPECT_EQ(recv, i);
  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
  CloseChannel(ch);
  t.join();
  EXPECT_EQ(sum_send, 10U);
  delete ch;
 }
-TEST(Channel, MakeAndClose) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
-  {
-    // MakeChannel should return a buffered channel is buffer_size > 0.
-    auto ch = MakeChannel<int>(10);
-    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-  {
-    // MakeChannel should return an un-buffered channel is buffer_size = 0.
-    auto ch = MakeChannel<int>(0);
-    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-}
 TEST(Channel, SufficientBufferSizeDoesntBlock) {
  const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
@@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
 TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
  const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
-  size_t sum = 0;
  std::thread t([&]() {
    // Try to write more than buffer size.
    for (size_t i = 0; i < 2 * buffer_size; ++i) {
@@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
      else
        EXPECT_EQ(ch->Send(&i), false);
-      sum += i;
    }
  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  EXPECT_EQ(sum, 45U);
  CloseChannel(ch);
  t.join();
  delete ch;
@@ -211,7 +197,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
        },
        &thread_ended[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  // Verify that all the threads are blocked
  for (size_t i = 0; i < num_threads; i++) {
@@ -222,7 +208,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
  // This should unblock all receivers
  CloseChannel(ch);
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
@@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
  for (size_t i = 0; i < num_threads; i++) t[i].join();
 }
-void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
+void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
  size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];
@@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
        },
        &thread_ended[i], &send_success[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  if (dynamic_cast<Buffered<int> *>(ch)) {
+  if (isBuffered) {
    // If ch is Buffered, atleast 4 threads must be blocked.
    int ct = 0;
    for (size_t i = 0; i < num_threads; i++) {
@@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
  // This should unblock all senders
  CloseChannel(ch);
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }
-  if (dynamic_cast<Buffered<int> *>(ch)) {
+  if (isBuffered) {
    // Verify that only 1 send was successful
    int ct = 0;
    for (size_t i = 0; i < num_threads; i++) {
@@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
 //  any senders waiting for channel to have write space
 TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
  auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksSendersTest(ch);
+  ChannelCloseUnblocksSendersTest(ch, true);
  delete ch;
 }
@@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
 //  unblocks any senders waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
  auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksReceiversTest(ch);
+  ChannelCloseUnblocksSendersTest(ch, false);
  delete ch;
 }
@@ -342,7 +325,7 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
    ch->Receive(&recv);
    EXPECT_EQ(recv, i);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  EXPECT_EQ(sum_send, 3U);
  CloseChannel(ch);
@@ -368,7 +351,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
    ch->Send(&i);
    sum_send += i;
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  EXPECT_EQ(sum_send, 10U);
  EXPECT_EQ(sum_receive, 10U);
  // send three more elements
@@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
 // This tests that destroying a channel unblocks
 //  any senders waiting for channel to have write space
-void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
  size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];
@@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
        &thread_ended[i], &send_success[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  bool is_buffered_channel = false;
-  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
-  if (is_buffered_channel) {
+  if (isBuffered) {
    // If channel is buffered, verify that atleast 4 threads are blocked
    int ct = 0;
    for (size_t i = 0; i < num_threads; i++) {
@@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
    EXPECT_EQ(thread_ended[i], true);
  }
-  // Count number of successfuld sends
+  // Count number of successful sends
  int ct = 0;
  for (size_t i = 0; i < num_threads; i++) {
    if (send_success[i]) ct++;
  }
-  if (is_buffered_channel) {
+  if (isBuffered) {
    // Only 1 send must be successful
    EXPECT_EQ(ct, 1);
  } else {
@@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
 TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
  size_t buffer_size = 1;
  auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockSenders(ch);
+  ChannelDestroyUnblockSenders(ch, true);
 }
 // This tests that destroying an unbuffered channel also unblocks
@@ -507,7 +488,20 @@ TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
 TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
  auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockSenders(ch);
+  ChannelDestroyUnblockSenders(ch, false);
+}
+// ChannelHolder::Cap() must report the size passed to Reset<int>(), both for
+// a buffered channel (10) and for an unbuffered one (0).
+TEST(ChannelHolder, ChannelHolderCapacityTest) {
+  const size_t buffer_size = 10;
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  delete ch;
+  // Unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  delete ch;
 }
 void ChannelHolderSendReceive(ChannelHolder *ch) {
@@ -641,7 +635,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
        },
        &thread_ended[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  // Verify that all the threads are blocked
  for (size_t i = 0; i < num_threads; i++) {
@@ -652,7 +646,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
  // This should unblock all receivers
  ch->close();
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
@@ -663,9 +657,6 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
 }
 void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
  size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];
@@ -683,7 +674,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
        },
        &thread_ended[i], &send_success[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
  if (isBuffered) {
    // If ch is Buffered, atleast 4 threads must be blocked.
@@ -702,7 +693,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
  // This should unblock all senders
  ch->close();
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
@@ -775,7 +766,7 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
        &thread_ended[i], &send_success[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
  if (isBuffered) {
    // If channel is buffered, verify that atleast 4 threads are blocked
    int ct = 0;
@@ -836,7 +827,7 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
        },
        &thread_ended[i]);
  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
  // Verify that all threads are blocked
  for (size_t i = 0; i < num_threads; i++) {

--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
    PassTensorData(&out, &in);
  }
+  // do data type transform
  if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
    TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
    transformed = true;

--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -16,13 +16,16 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace framework {
 inline proto::VarType::Type ToDataType(std::type_index type) {
  using namespace paddle::framework::proto;
-  if (typeid(float).hash_code() == type.hash_code()) {
+  if (typeid(platform::float16).hash_code() == type.hash_code()) {
+    return proto::VarType::FP16;
+  } else if (typeid(float).hash_code() == type.hash_code()) {
    return proto::VarType::FP32;
  } else if (typeid(double).hash_code() == type.hash_code()) {
    return proto::VarType::FP64;
@@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
 inline std::type_index ToTypeIndex(proto::VarType::Type type) {
  using namespace paddle::framework::proto;
  switch (type) {
+    case proto::VarType::FP16:
+      return typeid(platform::float16);
    case proto::VarType::FP32:
      return typeid(float);
    case proto::VarType::FP64:
@@ -59,6 +64,9 @@ template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
  using namespace paddle::framework::proto;
  switch (type) {
+    case proto::VarType::FP16:
+      visitor.template operator()<platform::float16>();
+      break;
    case proto::VarType::FP32:
      visitor.template operator()<float>();
      break;

--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -47,9 +47,15 @@ struct CastDataType {
      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
      trans(*context, in_begin, in_end, out_begin,
            CastDataTypeFunctor<InType, OutType>());
+#ifdef __NVCC__
+    } else if (platform::is_gpu_place(in_.place())) {
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+#endif
    } else {
-      // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupported place!");
-      PADDLE_THROW("Unsupport CPU <-> GPU!");
    }
  }
 };
@@ -65,6 +71,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
  auto ctx = pool.Get(in.place());
  switch (src_type) {
+    case proto::VarType::FP16:
+      framework::VisitDataType(dst_type,
+                               CastDataType<platform::float16>(in, out, ctx));
+      break;
    case proto::VarType::FP32:
      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
      break;

--- a/paddle/fluid/framework/data_type_transform.cu
+++ b/paddle/fluid/framework/data_type_transform.cu
+data_type_transform.cc
\ No newline at end of file
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -22,32 +22,145 @@ TEST(DataTypeTransform, CPUTransform) {
  auto place = CPUPlace();
-  Tensor in;
+  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
-  Tensor out;
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
-  int data_number = 2 * 3;
-  for (int i = 0; i < data_number; ++i) {
-    ptr[i] = i / 3;
-  }
  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  TransDataType(kernel_fp32, kernel_fp64, in, &out);
+  // data type transform from float32
-  double* out_data_double = out.data<double>();
+  {
-  for (int i = 0; i < data_number; ++i) {
+    Tensor in;
-    ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+    Tensor out;
+    float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+    for (int i = 0; i < data_number; ++i) {
+      ptr[i] = i / 3;
+    }
+    TransDataType(kernel_fp32, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+    }
+    TransDataType(kernel_fp32, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+    }
  }
-  TransDataType(kernel_fp32, kernel_int32, in, &out);
+  // data type transform from/to float16
-  int* out_data_int = out.data<int>();
+  {
-  for (int i = 0; i < data_number; ++i) {
+    Tensor in;
-    ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+    Tensor out;
+    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+    for (int i = 0; i < data_number; ++i) {
+      ptr[i] = i;
+    }
+    // transform from float16 to other data types
+    TransDataType(kernel_fp16, kernel_fp32, in, &out);
+    float* out_data_float = out.data<float>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+    }
+    TransDataType(kernel_fp16, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+    }
+    TransDataType(kernel_fp16, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+    }
+    TransDataType(kernel_fp16, kernel_int64, in, &out);
+    int64_t* out_data_int64 = out.data<int64_t>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+    }
+    TransDataType(kernel_fp16, kernel_bool, in, &out);
+    bool* out_data_bool = out.data<bool>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+    }
+    // transform float to float16
+    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_float[i] = i;
+    }
+    TransDataType(kernel_fp32, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+    }
+    // transform double to float16
+    double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_double[i] = i;
+    }
+    TransDataType(kernel_fp64, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+    }
+    // transform int to float16
+    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int[i] = i;
+    }
+    TransDataType(kernel_int32, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+    }
+    // transform int64 to float16
+    int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int64[i] = i;
+    }
+    TransDataType(kernel_int64, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+    }
+    // transform bool to float16
+    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_bool[i] = i;
+    }
+    TransDataType(kernel_bool, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+    }
  }
 }
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "gtest/gtest.h"
+// Exercises TransDataType on tensors placed on CUDAPlace(0).
+//
+// Every case follows the same round trip: fill a host tensor `in`, copy it to
+// the device (`in_gpu`), convert it with TransDataType into `out_gpu`, copy the
+// result back to the host (`out`), wait on `context`, and compare element by
+// element against a static_cast of the host-side input.
+//
+// NOTE(review): TransDataType fetches its device context from a pool keyed by
+// the tensor's place, while the copies below use the locally constructed
+// `context`. Confirm that the conversion and the copies are ordered with
+// respect to each other.
+TEST(DataTypeTransform, GPUTransform) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  auto cpu_place = CPUPlace();
+  auto gpu_place = CUDAPlace(0);
+  CUDADeviceContext context(gpu_place);
+  // One kernel type per data type under test; all of them target gpu_place,
+  // so every TransDataType call below changes the data type only.
+  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  // data type transform from float32
+  {
+    Tensor in;       // host-side input
+    Tensor in_gpu;   // device copy of `in`
+    Tensor out_gpu;  // device-side conversion result
+    Tensor out;      // host copy of `out_gpu`, used for the assertions
+    float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float arr[6] = {0, 1, 2, 3, 4, 5};
+    int data_number = sizeof(arr) / sizeof(arr[0]);
+    // NOTE(review): memcpy is used without a visible <cstring> include in this
+    // file; presumably it arrives transitively -- confirm.
+    memcpy(in_ptr, arr, sizeof(arr));
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    // float32 -> float64
+    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    // Presumably the copy is asynchronous on `context`; wait before reading
+    // `out` on the host.
+    context.Wait();
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_double[i], static_cast<double>(arr[i]));
+    }
+    // float32 -> int32
+    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int[i], static_cast<int>(arr[i]));
+    }
+  }
+  // data type transform from/to float16
+  {
+    Tensor in;
+    Tensor in_gpu;
+    Tensor out_gpu;
+    Tensor out;
+    // `ptr` first refers to the float16 host input; in the second half of this
+    // scope it is re-pointed at the float16 host output.
+    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
+    float16 arr[6] = {float16(0), float16(1), float16(2),
+                      float16(3), float16(4), float16(5)};
+    int data_number = sizeof(arr) / sizeof(arr[0]);
+    memcpy(ptr, arr, sizeof(arr));
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    // transform from float16 to other data types
+    // float16 -> float32
+    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    float* out_data_float = out.data<float>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+    }
+    // float16 -> float64
+    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+    }
+    // float16 -> int32
+    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+    }
+    // float16 -> int64
+    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    int64_t* out_data_int64 = out.data<int64_t>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+    }
+    // float16 -> bool
+    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    bool* out_data_bool = out.data<bool>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+    }
+    // From here on the direction is reversed: `in` is re-requested with a new
+    // element type for each case and the output is always float16. The checks
+    // compare the `x` member of each float16 (presumably its raw bit pattern
+    // -- TODO confirm) against a host-side static_cast of the input.
+    // transform float to float16
+    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_float[i] = i;
+    }
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    // `ptr` now refers to the float16 data of the host output tensor.
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+    }
+    // transform double to float16
+    double* in_data_double =
+        in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_double[i] = i;
+    }
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+    }
+    // transform int to float16
+    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int[i] = i;
+    }
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+    }
+    // transform int64 to float16
+    int64_t* in_data_int64 =
+        in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int64[i] = i;
+    }
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+    }
+    // transform bool to float16
+    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
+    for (int i = 0; i < data_number; ++i) {
+      // int -> bool conversion: index 0 becomes false, every other index true.
+      in_data_bool[i] = i;
+    }
+    TensorCopy(in, gpu_place, context, &in_gpu);
+    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
+    TensorCopy(out_gpu, cpu_place, context, &out);
+    context.Wait();
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+    }
+  }
+}
--- a/paddle/fluid/framework/details/buffered_channel.h
+++ b/paddle/fluid/framework/details/buffered_channel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <atomic>
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace framework {
-namespace details {
-// Four of the properties of Buffered Channel:
-// - A send to a full channel blocks temporarily until a receive from the
-// channel or the channel is closed.
-// - A receive from an empty channel blocks temporarily until a send to the
-// channel or the channel is closed.
-// - A send to a closed channel returns false immediately.
-// - A receive from a closed channel returns false immediately.
-template <typename T>
-class Buffered : public paddle::framework::Channel<T> {
-  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
-  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
- public:
-  virtual bool Send(T*);
-  virtual bool Receive(T*);
-  virtual size_t Cap() { return cap_; }
-  virtual void Close();
-  virtual ~Buffered();
- private:
-  size_t cap_;
-  std::mutex mu_;
-  std::condition_variable empty_cond_var_;
-  std::condition_variable full_cond_var_;
-  std::condition_variable destructor_cond_var_;
-  std::deque<T> channel_;
-  std::atomic<bool> closed_{false};
-  std::atomic<unsigned> send_ctr{0};
-  std::atomic<unsigned> recv_ctr{0};
-  Buffered(size_t cap) : cap_(cap), closed_(false) {
-    PADDLE_ENFORCE_GT(cap, 0);
-  }
-  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
-};
-template <typename T>
-bool Buffered<T>::Send(T* item) {
-  bool ret = false;
-  if (closed_) {
-    return ret;
-  }
-  send_ctr++;
-  std::unique_lock<std::mutex> lock(mu_);
-  full_cond_var_.wait(lock,
-                      [this]() { return channel_.size() < cap_ || closed_; });
-  if (!closed_) {
-    channel_.push_back(std::move(*item));
-    lock.unlock();
-    empty_cond_var_.notify_one();
-    ret = true;
-  }
-  send_ctr--;
-  destructor_cond_var_.notify_one();
-  return ret;
-}
-template <typename T>
-bool Buffered<T>::Receive(T* item) {
-  bool ret = false;
-  // Once the channel has been closed and all data has been consumed,
-  // just return false. Don't even try acquiring the mutex.
-  if (closed_ && channel_.empty()) {
-    return false;
-  }
-  recv_ctr++;
-  std::unique_lock<std::mutex> lock(mu_);
-  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
-  if (!channel_.empty()) {
-    *item = std::move(channel_.front());
-    channel_.pop_front();
-    full_cond_var_.notify_one();
-    ret = true;
-  }
-  recv_ctr--;
-  destructor_cond_var_.notify_one();
-  return ret;
-}
-template <typename T>
-void Buffered<T>::Close() {
-  if (closed_) {
-    return;
-  }
-  std::unique_lock<std::mutex> lock(mu_);
-  closed_ = true;
-  NotifyAllParticipants(&lock);
-}
-template <typename T>
-Buffered<T>::~Buffered() {
-  std::unique_lock<std::mutex> lock(mu_);
-  closed_ = true;
-  channel_.clear();
-  NotifyAllParticipants(&lock);
-  // The destructor must wait for all readers and writers to complete their task
-  // The channel has been closed, so we will not accept new readers and writers
-  lock.lock();
-  destructor_cond_var_.wait(
-      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
-}
-template <typename T>
-void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
-  lock->unlock();
-  full_cond_var_.notify_all();
-  empty_cond_var_.notify_all();
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/unbuffered_channel.h
+++ b/paddle/fluid/framework/details/unbuffered_channel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-#include "paddle/fluid/framework/channel.h"
-namespace paddle {
-namespace framework {
-namespace details {
-// Four of the properties of UnBuffered Channel:
-// - A send to a channel blocks temporarily until a receive from the
-// channel or the channel is closed.
-// - A receive from a channel blocks temporarily until a send to the
-// channel or the channel is closed.
-// - A send to a closed channel returns false immediately.
-// - A receive from a closed channel returns false immediately.
-template <typename T>
-class UnBuffered : public paddle::framework::Channel<T> {
-  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
-  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
- public:
-  virtual bool Send(T*);
-  virtual bool Receive(T*);
-  virtual size_t Cap() { return 0; }
-  virtual void Close();
-  virtual ~UnBuffered();
- private:
-  std::mutex mu_ch_;
-  // Mutex for readers and writers who are waiting for other reader
-  // and writer to complete execution
-  std::recursive_mutex mu_read_, mu_write_;
-  // reader_found_ is set true when a reader is ready to accept data
-  // writer_found_ is set true when a writer is ready to send data
-  // A transaction occurs only when both are true
-  std::atomic<bool> reader_found_{false}, writer_found_{false};
-  std::condition_variable cv_channel_;
-  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
-  T* item{nullptr};
-  std::atomic<bool> closed_{false};
-  std::atomic<unsigned> send_ctr{0};
-  std::atomic<unsigned> recv_ctr{0};
-  UnBuffered() : closed_(false) {}
-  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
-};
-// This function implements the concept of how data should
-// be sent from a writer to a reader.
-template <typename T>
-bool UnBuffered<T>::Send(T* data) {
-  bool ret = false;
-  if (closed_) {
-    return ret;
-  }
-  send_ctr++;
-  // Prevent other writers from entering
-  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
-  writer_found_ = true;
-  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
-  // If writer comes first, it should wait till a reader arrives
-  cv_writer_.wait(cv_lock,
-                  [this]() { return reader_found_ == true || closed_; });
-  cv_reader_.notify_one();
-  if (!closed_) {
-    std::unique_lock<std::mutex> channel_lock(mu_ch_);
-    item = data;
-    channel_lock.unlock();
-    cv_channel_.notify_one();
-    channel_lock.lock();
-    cv_channel_.wait(channel_lock,
-                     [this]() { return item == nullptr || closed_; });
-    ret = true;
-  }
-  writer_found_ = false;
-  send_ctr--;
-  cv_destructor_.notify_one();
-  return ret;
-}
-// This function implements the concept of how
-// data that was sent by a writer is read from a reader.
-template <typename T>
-bool UnBuffered<T>::Receive(T* data) {
-  bool ret = false;
-  // If channel is closed, we don't even want any reader to enter.
-  // Unlike a buffered channel, an unbuffered channel does not allow
-  // readers to read after closing because there is no buffer to be consumed.
-  if (closed_) return ret;
-  recv_ctr++;
-  // Prevent other readers from entering
-  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
-  reader_found_ = true;
-  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
-  // If reader comes first, it should wait till a writer arrives
-  cv_reader_.wait(cv_lock,
-                  [this]() { return writer_found_ == true || closed_; });
-  cv_writer_.notify_one();
-  if (!closed_) {
-    std::unique_lock<std::mutex> lock_ch{mu_ch_};
-    // Reader should wait for the writer to first write its data
-    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
-    if (!closed_) {
-      *data = std::move(*item);
-      item = nullptr;
-      lock_ch.unlock();
-      ret = true;
-    }
-    cv_channel_.notify_one();
-  }
-  reader_found_ = false;
-  recv_ctr--;
-  cv_destructor_.notify_one();
-  return ret;
-}
-// This function implements the sequence of events
-// that take place once the channel is closed.
-template <typename T>
-void UnBuffered<T>::Close() {
-  if (closed_) {
-    return;
-  }
-  std::unique_lock<std::mutex> lock(mu_ch_);
-  item = nullptr;
-  closed_ = true;
-  NotifyAllParticipants(&lock);
-}
-// This function implements the sequence of events
-// that are executed once the object of an UnBuffered
-// channel is destroyed.
-template <typename T>
-UnBuffered<T>::~UnBuffered() {
-  std::unique_lock<std::mutex> lock(mu_ch_);
-  item = nullptr;
-  closed_ = true;
-  NotifyAllParticipants(&lock);
-  lock.lock();
-  cv_destructor_.wait(lock,
-                      [this]() { return send_ctr == 0 && recv_ctr == 0; });
-}
-// This function notifies all the readers, writers and
-// the channel condition variables.
-template <typename T>
-void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
-  lock->unlock();
-  cv_writer_.notify_all();
-  cv_channel_.notify_all();
-  cv_reader_.notify_all();
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -157,9 +157,15 @@ HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
  throw std::invalid_argument("Invalid index");
 #else
  PADDLE_ASSERT(false);
-#endif
+#if CUDA_VERSION < 8000
+  // On CUDA versions previous to 8.0, only __shared__ variables
+  // could be declared as static in the device code.
+  int64_t head = 0;
+#else
  static int64_t head = 0;
+#endif
  return head;
+#endif
 }
 template <int D>
@@ -183,9 +189,15 @@ HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
  throw std::invalid_argument("Invalid index");
 #else
  PADDLE_ASSERT(false);
-#endif
+#if CUDA_VERSION < 8000
+  // On CUDA versions previous to 8.0, only __shared__ variables
+  // could be declared as static in the device code.
+  int64_t head = 0;
+#else
  static int64_t head = 0;
+#endif
  return head;
+#endif
 }
 }  // namespace

--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -27,8 +27,6 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
-const std::string kDropOutOpType = "dropout";
-const std::string kBatchNormOpType = "batch_norm";
 bool HasDependentVar(const proto::OpDesc& op_desc,
                     const std::set<std::string>& dependent_vars) {
@@ -186,18 +184,13 @@ void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
  prune_impl(input, output, 0, -1, dependent_vars);
 }
-void inference_optimize_impl(const proto::ProgramDesc& input,
+void inference_optimize_impl(proto::ProgramDesc* input, int block_id) {
-                             proto::ProgramDesc* output, int block_id) {
+  auto* op_field = input->mutable_blocks(block_id)->mutable_ops();
-  *output = input;
-  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
  for (auto& op_desc : *op_field) {
-    if (op_desc.type() == kDropOutOpType ||
+    for (auto& attr : *op_desc.mutable_attrs()) {
-        op_desc.type() == kBatchNormOpType) {
+      if (attr.name() == "is_test") {
-      for (auto& attr : *op_desc.mutable_attrs()) {
+        attr.set_b(true);
-        if (attr.name() == "is_test") {
+        break;
-          attr.set_b(true);
-          break;
-        }
      }
    }
  }
@@ -205,7 +198,12 @@ void inference_optimize_impl(const proto::ProgramDesc& input,
 void InferenceOptimize(const proto::ProgramDesc& input,
                       proto::ProgramDesc* output) {
-  inference_optimize_impl(input, output, 0);
+  *output = input;  // copy first; only the copy in `output` is modified below
+  int num_blocks = output->blocks_size();
+  PADDLE_ENFORCE_GT(num_blocks, 0, "ProgramDesc must have at least one block");
+  for (int i = 0; i < num_blocks; ++i) {  // visit every block, not only block 0 as before
+    inference_optimize_impl(output, i);  // sets each op's "is_test" attribute (if present) to true
+  }
 }
 }  // namespace framework

--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const {
  return shapes_[idx];
 }
-void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
-  if (iteration_pos_ >= buffer_.size()) {
-    // Reload buffer with new data
-    buffer_.clear();
-    buffer_.reserve(buffer_size_);
-    for (int i = 0; i < buffer_size_; ++i) {
-      if (reader_->HasNext()) {
-        buffer_.push_back(std::vector<LoDTensor>());
-        reader_->ReadNext(&buffer_.back());
-      } else {
-        break;
-      }
-    }
-    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
-    // optimize.
-    std::random_shuffle(buffer_.begin(), buffer_.end());
-    iteration_pos_ = 0;
-  }
-  out->clear();
-  if (!buffer_.empty()) {
-    std::swap(*out, buffer_[iteration_pos_++]);
-  }
-  // if buffer_ is empty, the 'out' will return as an empty vector.
-}
-void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
-  buffer_.clear();
-  buffer_.reserve(batch_size_);
-  for (int i = 0; i < batch_size_; ++i) {
-    if (reader_->HasNext()) {
-      buffer_.push_back(std::vector<LoDTensor>());
-      reader_->ReadNext(&buffer_.back());
-    } else {
-      break;
-    }
-  }
-  // Concat instances
-  out->clear();
-  if (buffer_.empty()) {
-    // if buffer_ is empty, the 'out' will return as an empty vector.
-    return;
-  }
-  int out_num = buffer_[0].size();
-  out->reserve(out_num);
-  for (int j = 0; j < out_num; ++j) {
-    // Merge shape and check date type
-    std::type_index batch_type = buffer_[0][j].type();
-    DDim batch_shape = buffer_[0][j].dims();
-    for (size_t i = 1; i < buffer_.size(); ++i) {
-      std::type_index ins_type = buffer_[i][j].type();
-      DDim ins_shape = buffer_[i][j].dims();
-      PADDLE_ENFORCE_EQ(batch_type, ins_type);
-      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
-                        slice_ddim(ins_shape, 1, ins_shape.size()));
-      PADDLE_ENFORCE_GT(ins_shape[0], 0);
-      batch_shape[0] += ins_shape[0];
-    }
-    LoDTensor out_tensor;
-    out_tensor.Resize(batch_shape);
-    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
-    int64_t dst_offset = 0;
-    // Merge lod and data
-    LoD batch_lod;
-    for (size_t i = 0; i < buffer_.size(); ++i) {
-      DDim ins_shape = buffer_[i][j].dims();
-      LoD ins_lod = buffer_[i][j].lod();
-      if (i == 0) {
-        batch_lod = ins_lod;
-      } else {
-        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
-        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
-          auto& lod_level = batch_lod[level_idx];
-          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
-            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
-          }
-        }
-      }
-      Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
-      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
-      dst_offset += ins_shape[0];
-    }
-    out_tensor.set_lod(batch_lod);
-    out->push_back(out_tensor);
-  }
-}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -60,83 +60,8 @@ class DecoratedReader : public ReaderBase {
  ReaderBase* reader_;
 };
-// file readers
+// The ReaderHolder is used as the readers' unified wrapper,
+// making it easier to access different types of readers in Variables.
-template <typename T>
-class RandomDataGenerator : public FileReader {
- public:
-  RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
-      : FileReader(shapes), min_(min), max_(max) {
-    PADDLE_ENFORCE_LE(
-        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
-    unsigned int seed = std::random_device()();
-    engine_.seed(seed);
-    dist_ = std::uniform_real_distribution<float>(min_, max_);
-  }
-  void ReadNext(std::vector<LoDTensor>* out) override {
-    out->clear();
-    out->reserve(shapes_.size());
-    for (const DDim& shape : shapes_) {
-      PADDLE_ENFORCE_GE(
-          shape.size(), 2,
-          "The rank of reader's output data should be 2 at least.(Now it's %d)",
-          shape.size());
-      LoDTensor out_tensor;
-      out_tensor.Resize(shape);
-      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
-      int64_t numel = product(shape);
-      for (int64_t i = 0; i < numel; ++i) {
-        data[i] = dist_(engine_);
-      }
-      out->push_back(out_tensor);
-    }
-  }
-  bool HasNext() const override { return true; }
-  void ReInit() override { return; }
- private:
-  float min_;
-  float max_;
-  std::minstd_rand engine_;
-  std::uniform_real_distribution<float> dist_;
-};
-// decorated readers
-class ShuffleReader : public DecoratedReader {
- public:
-  ShuffleReader(ReaderBase* reader, int buffer_size)
-      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
-    buffer_.reserve(buffer_size);
-  }
-  void ReadNext(std::vector<LoDTensor>* out) override;
- private:
-  int buffer_size_;
-  std::vector<std::vector<LoDTensor>> buffer_;
-  size_t iteration_pos_;
-};
-class BatchReader : public DecoratedReader {
- public:
-  BatchReader(ReaderBase* reader, int batch_size)
-      : DecoratedReader(reader), batch_size_(batch_size) {
-    buffer_.reserve(batch_size_);
-  }
-  void ReadNext(std::vector<LoDTensor>* out) override;
- private:
-  int batch_size_;
-  std::vector<std::vector<LoDTensor>> buffer_;
-};
-// The ReaderHolder is used as readers' unified wrapper,
-// making it easier to access different type readers in Variables.
 class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }

--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -235,27 +235,53 @@ TEST(TensorToVector, Tensor) {
 TEST(TensorContainsNAN, CPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
-  Tensor src;
+  {
-  float* buf = src.mutable_data<float>({3}, CPUPlace());
+    Tensor src;
-  buf[0] = 0.0;
+    float* buf = src.mutable_data<float>({3}, CPUPlace());
-  buf[1] = NAN;
+    buf[0] = 0.0;
-  buf[2] = 0.0;
+    buf[1] = NAN;
-  ASSERT_TRUE(TensorContainsNAN(src));
+    buf[2] = 0.0;
-  buf[1] = 0.0;
+    ASSERT_TRUE(TensorContainsNAN(src));
-  ASSERT_FALSE(TensorContainsNAN(src));
+    buf[1] = 0.0;
+    ASSERT_FALSE(TensorContainsNAN(src));
+  }
+  {
+    Tensor src;
+    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
+    buf[0] = 0.0;
+    buf[1].x = 0x7fff;
+    buf[2] = 0.0;
+    ASSERT_TRUE(TensorContainsNAN(src));
+    buf[1] = 0.0;
+    ASSERT_FALSE(TensorContainsNAN(src));
+  }
 }
 TEST(TensorContainsInf, CPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
-  Tensor src;
+  {
-  double* buf = src.mutable_data<double>({3}, CPUPlace());
+    Tensor src;
-  buf[0] = 1.0;
+    double* buf = src.mutable_data<double>({3}, CPUPlace());
-  buf[1] = INFINITY;
+    buf[0] = 1.0;
-  buf[2] = 0.0;
+    buf[1] = INFINITY;
-  ASSERT_TRUE(TensorContainsInf(src));
+    buf[2] = 0.0;
-  buf[1] = 1.0;
+    ASSERT_TRUE(TensorContainsInf(src));
-  ASSERT_FALSE(TensorContainsInf(src));
+    buf[1] = 1.0;
+    ASSERT_FALSE(TensorContainsInf(src));
+  }
+  {
+    Tensor src;
+    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
+    buf[0] = 1.0;
+    buf[1].x = 0x7c00;
+    buf[2] = 0.0;
+    ASSERT_TRUE(TensorContainsInf(src));
+    buf[1] = 1.0;
+    ASSERT_FALSE(TensorContainsInf(src));
+  }
 }
 TEST(Tensor, FromAndToStream) {

--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) {
  buf[1] = 0.1;
  buf[2] = NAN;
 }
 static __global__ void FillInf(float* buf) {
  buf[0] = 0.0;
  buf[1] = INFINITY;
  buf[2] = 0.5;
 }
+// float16 overload of the NaN-filling kernel: stores 0.0, 0.1 and a NaN-like
+// value into buf[0..2]. The third element is written through the `x` member
+// instead of being assigned a number.
+// NOTE(review): presumably `x` is the raw 16-bit storage of platform::float16;
+// 0x7fff (all exponent bits set, non-zero mantissa) is then an IEEE 754
+// half-precision NaN -- confirm against the float16 definition.
+static __global__ void FillNAN(platform::float16* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2].x = 0x7fff;
+}
+// float16 overload of the Inf-filling kernel: stores 0.0, an infinity-like
+// value and 0.5 into buf[0..2]. The middle element is written through the
+// `x` member instead of being assigned a number.
+// NOTE(review): presumably `x` is the raw 16-bit storage of platform::float16;
+// 0x7c00 (all exponent bits set, zero mantissa) is then IEEE 754
+// half-precision +Inf -- confirm against the float16 definition.
+static __global__ void FillInf(platform::float16* buf) {
+  buf[0] = 0.0;
+  buf[1].x = 0x7c00;
+  buf[2] = 0.5;
+}
 TEST(TensorContainsNAN, GPU) {
-  Tensor tensor;
+  using namespace paddle::platform;
-  platform::CUDAPlace gpu(0);
+  CUDAPlace gpu(0);
-  auto& pool = platform::DeviceContextPool::Instance();
+  auto& pool = DeviceContextPool::Instance();
  auto* cuda_ctx = pool.GetByPlace(gpu);
-  float* buf = tensor.mutable_data<float>({3}, gpu);
+  {
-  FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    Tensor tensor;
-  cuda_ctx->Wait();
+    float* buf = tensor.mutable_data<float>({3}, gpu);
-  ASSERT_TRUE(TensorContainsNAN(tensor));
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    ASSERT_TRUE(TensorContainsNAN(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    ASSERT_TRUE(TensorContainsNAN(tensor));
+  }
 }
 TEST(TensorContainsInf, GPU) {
-  Tensor tensor;
+  using namespace paddle::platform;
-  platform::CUDAPlace gpu(0);
+  CUDAPlace gpu(0);
-  auto& pool = platform::DeviceContextPool::Instance();
+  auto& pool = DeviceContextPool::Instance();
  auto* cuda_ctx = pool.GetByPlace(gpu);
-  float* buf = tensor.mutable_data<float>({3}, gpu);
+  {
-  FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    Tensor tensor;
-  cuda_ctx->Wait();
+    float* buf = tensor.mutable_data<float>({3}, gpu);
-  ASSERT_TRUE(TensorContainsInf(tensor));
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    ASSERT_TRUE(TensorContainsInf(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    ASSERT_TRUE(TensorContainsInf(tensor));
+  }
 }
 }  // namespace framework

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
@@ -13,6 +15,8 @@ function(op_library TARGET)
    set(cu_cc_srcs)
    set(cudnn_cu_cc_srcs)
    set(CUDNN_FILE)
+    set(mkldnn_cc_srcs)
+    set(MKLDNN_FILE)
    set(op_common_deps operator op_registry math_function)
    set(options "")
    set(oneValueArgs "")
@@ -36,12 +40,20 @@ function(op_library TARGET)
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
            list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
        endif()
+        if(WITH_MKLDNN)
+            string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
+                list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
+            endif()
+        endif()
    else()
        foreach(src ${op_library_SRCS})
            if (${src} MATCHES ".*\\.cu$")
                list(APPEND cu_srcs ${src})
            elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
                list(APPEND cudnn_cu_cc_srcs ${src})
+            elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
+                list(APPEND mkldnn_cc_srcs ${src})
            elseif(${src} MATCHES ".*\\.cu.cc$")
                list(APPEND cu_cc_srcs ${src})
            elseif(${src} MATCHES ".*\\.cc$")
@@ -62,15 +74,15 @@ function(op_library TARGET)
        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
    endif()
    if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                ${op_common_deps})
    else()
-        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
-                ${op_common_deps})
+            ${op_common_deps})
    endif()
    # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
        endif()
@@ -101,7 +113,8 @@ function(op_library TARGET)
    # pybind USE_CPU_ONLY_OP
    list(LENGTH cu_srcs cu_srcs_len)
    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
        set(pybind_flag 1)
    endif()
@@ -112,6 +125,11 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
    endif()
+    # pybind USE_OP_DEVICE_KERNEL for MKLDNN
+    if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
+    endif()
    # pybind USE_OP
    if (${pybind_flag} EQUAL 0)
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@@ -128,9 +146,9 @@ else()
    set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
+add_subdirectory(detail)
 if(WITH_DISTRIBUTE)
-    add_subdirectory(detail)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
-    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -170,7 +188,6 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
-op_library(create_reader_op DEPS reader)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
@@ -184,12 +201,18 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
+op_library(concat_op DEPS concat_functor)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
    op_library(${src})
 endforeach()
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+add_subdirectory(reader)
+foreach(src ${READER_LIBRARY})
+    set(OP_LIBRARY ${src} ${OP_LIBRARY})
+endforeach()
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")

--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
@@ -41,6 +41,14 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("ColToRowMatchIndices", dims);
    ctx->SetOutputDim("ColToRowMatchDist", dims);
  }
+ protected:
+  // Selects the kernel from the element type of the "DistMat" input and
+  // always places it on CPUPlace, regardless of the execution context's
+  // place.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto* dist_mat = ctx.Input<LoDTensor>("DistMat");
+    const auto data_type = framework::ToDataType(dist_mat->type());
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
 };
 template <typename T>

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
               ops::ConcatOpGrad, false)
-REGISTER_OP_CPU_KERNEL(concat,
+REGISTER_OP_CPU_KERNEL(
-                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
-REGISTER_OP_CPU_KERNEL(concat_grad,
+REGISTER_OP_CPU_KERNEL(
-                       ops::ConcatGradKernel<paddle::platform::CPUPlace, float>)
+    concat_grad,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 namespace paddle {
@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
+    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);
-    auto out_stride = framework::stride_numel(out->dims());
+    // Sometimes direct copies will be faster, this maybe need deeply analysis.
+    if (axis == 0 && ins.size() < 10) {
-    size_t output_offset = 0;
+      size_t output_offset = 0;
-    // If axis >=1, copy to out immediately need to call many times
-    // of cuda memcpy. Copy the input to cpu and do the stride copy,
-    // then copy to gpu output.
-    if (platform::is_gpu_place(place) && axis >= 1) {
-      platform::CPUPlace copy_place;
-      auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
-      framework::Tensor cpu_out;
-      cpu_out.Resize(out->dims());
-      cpu_out.mutable_data<T>(copy_place);
-      auto& dev_ctx = ctx.device_context();
-      std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
-      for (auto* in : ins) {
-        std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
-        framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
-        cpu_ins.emplace_back(std::move(cpu_in));
-      }
-      // TODO(dzhwinter): overlap copy and compute stream
-      // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
-      dev_ctx.Wait();
-      for (auto& in : cpu_ins) {
-        auto& cpu_in = *in.get();
-        auto in_stride = framework::stride_numel(cpu_in.dims());
-        StridedNumelCopyWithAxis<T>(
-            cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
-            cpu_in.data<T>(), in_stride, in_stride[axis]);
-        output_offset += in_stride[axis];
-      }
-      framework::TensorCopy(cpu_out, place, dev_ctx, out);
-    } else {
      for (auto* in : ins) {
        auto in_stride = framework::stride_numel(in->dims());
+        auto out_stride = framework::stride_numel(out->dims());
        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                    out->data<T>() + output_offset, out_stride,
                                    in->data<T>(), in_stride, in_stride[axis]);
        output_offset += in_stride[axis];
      }
+    } else {
+      std::vector<framework::Tensor> inputs(ins.size());
+      for (size_t j = 0; j < ins.size(); ++j) {
+        inputs[j] = *ins[j];
+      }
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
+      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
    }
  }
 };
@@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    size_t input_offset = 0;
-    auto in_stride = framework::stride_numel(in->dims());
-    for (auto& out : outs) {
+    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-      out->mutable_data<T>(ctx.GetPlace());
+    if (axis == 0 && outs.size() < 10) {
-      auto out_stride = framework::stride_numel(out->dims());
+      size_t input_offset = 0;
-      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
+      auto in_stride = framework::stride_numel(in->dims());
-                                  out_stride, in->data<T>() + input_offset,
-                                  in_stride, out_stride[axis]);
+      for (auto& out : outs) {
-      input_offset += out_stride[axis];
+        out->mutable_data<T>(ctx.GetPlace());
+        auto out_stride = framework::stride_numel(out->dims());
+        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
+                                    out_stride, in->data<T>() + input_offset,
+                                    in_stride, out_stride[axis]);
+        input_offset += out_stride[axis];
+      }
+    } else {
+      std::vector<framework::Tensor> outputs(outs.size());
+      for (size_t j = 0; j < outs.size(); ++j) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs[j] = *outs[j];
+      }
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
+          concat_grad_functor;
+      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
    }
  }
 };

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/conv_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+namespace paddle {
+namespace operators {
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::primitive;
+using mkldnn::convolution_forward;
+using mkldnn::convolution_backward_weights;
+using mkldnn::convolution_backward_data;
+using mkldnn::convolution_direct;
+using mkldnn::prop_kind;
+using mkldnn::padding_kind;
+using mkldnn::stream;
+namespace {
+std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
+ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
+                     const memory::desc& dst, const std::vector<int>& strides,
+                     const std::vector<int>& paddings,
+                     const mkldnn::engine& engine);
+convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
+    const memory::desc& src, const memory::desc& diff_weights,
+    const memory::desc& diff_dst, const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const convolution_forward::primitive_desc& conv_pd,
+    const mkldnn::engine& engine);
+convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
+    const memory::desc& diff_src, const memory::desc& weights,
+    const memory::desc& diff_dst, const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const convolution_forward::primitive_desc& conv_pd,
+    const mkldnn::engine& engine);
+}  // anonymous namespace
+// Forward conv2d kernel backed by MKLDNN. Tensors are described to MKLDNN as
+// f32 with NCHW (input/output) and OIHW (filter) layouts. Only groups == 1
+// and dilations == {1, 1} are accepted. The forward primitive descriptor is
+// stored in the device context under "<Output argument name>@conv_pd" so the
+// gradient kernel can look it up later.
+template <typename T>
+class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& mkldnn_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& cpu_engine = mkldnn_ctx.GetEngine();
+    const Tensor* in_tensor = ctx.Input<Tensor>("Input");
+    const Tensor* filter_tensor = ctx.Input<Tensor>("Filter");
+    Tensor* out_tensor = ctx.Output<Tensor>("Output");
+    // The argument name of "Output" is unique per op instance, so it is used
+    // as the prefix of the blob key stored in the device context.
+    const std::string pd_key = ctx.op().Output("Output") + "@conv_pd";
+    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    const std::vector<int> dilations =
+        ctx.Attr<std::vector<int>>("dilations");
+    const int groups = ctx.Attr<int>("groups");
+    // TODO(pzelazko-intel) add support for group convolution and dilation
+    PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
+    PADDLE_ENFORCE(
+        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
+        "dilation in convolution is not implemented yet");
+    const T* in_ptr = in_tensor->data<T>();
+    const T* filter_ptr = filter_tensor->data<T>();
+    // Allocate the output buffer before handing it to MKLDNN.
+    T* out_ptr = out_tensor->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(in_tensor->dims().size() == 4,
+                   "Input must be with 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(filter_tensor->dims().size() == 4,
+                   "Filter must be with 4 dimensions, i.e. OIHW");
+    const std::vector<int> src_tz =
+        paddle::framework::vectorize2int(in_tensor->dims());
+    const std::vector<int> weights_tz =
+        paddle::framework::vectorize2int(filter_tensor->dims());
+    const std::vector<int> dst_tz =
+        paddle::framework::vectorize2int(out_tensor->dims());
+    // TODO(pzelazko-intel): support more formats
+    // Memory descriptors for the convolution src/weights/dst.
+    auto src_md =
+        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
+    auto weights_md =
+        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
+    auto dst_md =
+        MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
+    // Wrap the existing tensor buffers as MKLDNN memory primitives. The
+    // read-only inputs need their constness dropped for the void* handle.
+    memory src_mem({src_md, cpu_engine}, const_cast<T*>(in_ptr));
+    memory weights_mem({weights_md, cpu_engine}, const_cast<T*>(filter_ptr));
+    memory dst_mem({dst_md, cpu_engine}, out_ptr);
+    std::unique_ptr<convolution_forward::primitive_desc> fwd_pd =
+        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
+                             cpu_engine);
+    // Keep a raw pointer for local use, then transfer ownership to the
+    // device context so the backward pass can refer to the descriptor.
+    convolution_forward::primitive_desc* fwd_pd_raw = fwd_pd.get();
+    std::shared_ptr<void> fwd_pd_blob(std::move(fwd_pd));
+    mkldnn_ctx.SetBlob(pd_key, fwd_pd_blob);
+    // Create the convolution primitive.
+    convolution_forward conv_prim(*fwd_pd_raw, src_mem, weights_mem, dst_mem);
+    // Submit the primitive to an eager stream and block until it finished.
+    std::vector<primitive> pipeline{conv_prim};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
+// Backward conv2d kernel backed by MKLDNN. Computes the gradient of the
+// filter and/or of the input, depending on which gradient outputs are
+// present. Tensors are described to MKLDNN as f32 with NCHW (input, output,
+// their gradients) and OIHW (filter and its gradient) layouts. The forward
+// primitive descriptor is fetched from the device context under
+// "<Output argument name>@conv_pd"; the kernel fails if it is missing.
+template <typename T>
+class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const Tensor* filter = ctx.Input<Tensor>("Filter");
+    const Tensor* output = ctx.Input<Tensor>("Output");
+    const Tensor* output_grad =
+        ctx.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+    // Nothing to compute when neither gradient is requested.
+    if (!input_grad && !filter_grad) return;
+    // Get a unique name from "argument" name of "Output" variable
+    // This name will be used as key when reading info from device context
+    const std::string key = ctx.op().Input("Output");
+    const std::string key_conv_pd = key + "@conv_pd";
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    const T* input_data = input->data<T>();
+    const T* filter_data = filter->data<T>();
+    const T* output_grad_data = output_grad->data<T>();
+    T* input_grad_data = nullptr;
+    T* filter_grad_data = nullptr;
+    // allocate memory for gradient of input/filter
+    if (input_grad) {
+      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+    }
+    if (filter_grad) {
+      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+    }
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> weights_tz =
+        paddle::framework::vectorize2int(filter->dims());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    // TODO(pzelazko-intel): support more formats
+    auto conv_src_md =
+        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
+    auto conv_diff_src_md =
+        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
+    auto conv_weights_md =
+        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
+    auto conv_diff_weights_md =
+        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
+    auto conv_diff_dst_md =
+        MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
+    // create memory
+    // The output gradient has the shape/layout of the forward output, so it
+    // must be wrapped with the diff_dst descriptor (not the filter's one);
+    // both backward primitive descriptors below are built from
+    // conv_diff_dst_md and consume this memory as their diff_dst.
+    auto conv_diff_dst_memory =
+        memory({conv_diff_dst_md, mkldnn_engine}, (void*)output_grad_data);
+    // Retrieve conv_pd from device context
+    std::shared_ptr<void> conv_pd;
+    convolution_forward::primitive_desc* p_conv_pd;
+    conv_pd = dev_ctx.GetBlob(key_conv_pd);
+    PADDLE_ENFORCE(conv_pd != nullptr,
+                   "Fail to find conv_pd in device context");
+    p_conv_pd =
+        static_cast<convolution_forward::primitive_desc*>(conv_pd.get());
+    // create backward conv primitive for weights
+    if (filter_grad) {
+      // create primitive descriptor
+      convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
+          ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
+                                      conv_diff_dst_md, strides, paddings,
+                                      *p_conv_pd, mkldnn_engine);
+      // create memory
+      auto conv_diff_weights_memory = memory(
+          {conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
+      auto conv_src_memory =
+          memory({conv_src_md, mkldnn_engine}, (void*)input_data);
+      // create backward conv primitive for weights
+      auto conv_bwd_weights_prim = convolution_backward_weights(
+          conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
+          conv_diff_weights_memory);
+      // push primitive and execute it
+      std::vector<primitive> pipeline{conv_bwd_weights_prim};
+      stream(stream::kind::eager).submit(pipeline).wait();
+    }
+    if (input_grad) {
+      // create primitive descriptor
+      convolution_backward_data::primitive_desc conv_bwd_data_pd =
+          ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
+                                   conv_diff_dst_md, strides, paddings,
+                                   *p_conv_pd, mkldnn_engine);
+      // create memory
+      auto conv_diff_src_memory =
+          memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
+      auto conv_weights_memory =
+          memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
+      // create backward conv primitive for data
+      auto conv_bwd_data_prim =
+          convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
+                                    conv_weights_memory, conv_diff_src_memory);
+      // push primitive and execute it
+      std::vector<primitive> pipeline{conv_bwd_data_prim};
+      stream(stream::kind::eager).submit(pipeline).wait();
+    }
+  }  // Compute()
+};
+namespace {
+// Builds a heap-allocated forward convolution primitive descriptor
+// (prop_kind::forward, direct algorithm, zero padding). Only the first two
+// entries of `strides` and `paddings` are used; the same padding is applied
+// on both sides.
+std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
+    const memory::desc& src, const memory::desc& weights,
+    const memory::desc& dst, const std::vector<int>& strides,
+    const std::vector<int>& paddings, const mkldnn::engine& engine) {
+  memory::dims stride_dims{strides[0], strides[1]};
+  memory::dims padding_dims{paddings[0], paddings[1]};
+  convolution_forward::desc fwd_desc(
+      prop_kind::forward, convolution_direct, src, weights, dst, stride_dims,
+      padding_dims, padding_dims, padding_kind::zero);
+  return std::unique_ptr<convolution_forward::primitive_desc>(
+      new convolution_forward::primitive_desc(fwd_desc, engine));
+}
+// Builds the primitive descriptor for the weights-gradient convolution
+// (direct algorithm, zero padding, same padding on both sides), using the
+// forward primitive descriptor `conv_pd` as the hint.
+convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
+    const memory::desc& src, const memory::desc& diff_weights,
+    const memory::desc& diff_dst, const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const convolution_forward::primitive_desc& conv_pd,
+    const mkldnn::engine& engine) {
+  convolution_backward_weights::desc bwd_weights_desc(
+      convolution_direct, src, diff_weights, diff_dst, strides, paddings,
+      paddings, padding_kind::zero);
+  convolution_backward_weights::primitive_desc bwd_weights_pd(
+      bwd_weights_desc, engine, conv_pd);
+  return bwd_weights_pd;
+}
+// Builds the primitive descriptor for the input-gradient convolution
+// (direct algorithm, zero padding, same padding on both sides), using the
+// forward primitive descriptor `conv_pd` as the hint.
+convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
+    const memory::desc& diff_src, const memory::desc& weights,
+    const memory::desc& diff_dst, const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const convolution_forward::primitive_desc& conv_pd,
+    const mkldnn::engine& engine) {
+  convolution_backward_data::desc bwd_data_desc(
+      convolution_direct, diff_src, weights, diff_dst, strides, paddings,
+      paddings, padding_kind::zero);
+  convolution_backward_data::primitive_desc bwd_data_pd(bwd_data_desc, engine,
+                                                        conv_pd);
+  return bwd_data_pd;
+}
+}  // anonymous namespace
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConvOpMkldnnKernel<float>);
+REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConvGradOpMkldnnKernel<float>);
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  framework::LibraryType library_{framework::LibraryType::kPlain};
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
+  if (platform::CanCUDNNBeUsed(ctx)) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    library_ = framework::LibraryType::kCUDNN;
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
  }
 #endif
-  framework::LibraryType library_;
+#ifdef PADDLE_WITH_MKLDNN
-  if (use_cudnn) {
+  if (library_ == framework::LibraryType::kPlain &&
-    library_ = framework::LibraryType::kCUDNN;
+      platform::CanMKLDNNBeUsed(ctx)) {
-  } else {
+    library_ = framework::LibraryType::kMKLDNN;
-    library_ = framework::LibraryType::kPlain;
  }
+#endif
  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  framework::LibraryType library_{framework::LibraryType::kPlain};
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
+  if (platform::CanCUDNNBeUsed(ctx)) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    library_ = framework::LibraryType::kCUDNN;
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
  }
 #endif
+#ifdef PADDLE_WITH_MKLDNN
-  framework::LibraryType library_;
+  if (library_ == framework::LibraryType::kPlain &&
-  if (use_cudnn) {
+      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
+    library_ = framework::LibraryType::kMKLDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
  }
+#endif
  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),

--- a/paddle/fluid/operators/create_reader_op.cc
+++ b/paddle/fluid/operators/create_reader_op.cc
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-namespace paddle {
-namespace operators {
-static std::vector<framework::DDim> RestoreShapes(
-    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
-  std::vector<framework::DDim> res;
-  int offset = 0;
-  for (int len : ranks) {
-    auto start_it = shape_concat.begin() + offset;
-    auto end_it = start_it + len;
-    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
-    offset += len;
-  }
-  return res;
-}
-// general infershape for file readers
-class CreateFileReaderInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "The output file reader should not be null.");
-    const auto shape_concat =
-        ctx->Attrs().Get<std::vector<int>>("shape_concat");
-    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
-    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
-    ctx->SetReaderDims("Out", shapes);
-    if (ctx->IsRuntime()) {
-      const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
-      PADDLE_ENFORCE_EQ(
-          lod_levels.size(), shapes.size(),
-          "The number of 'lod_levels'(%d) doesn't match the number "
-          "of 'shapes'(%d).",
-          lod_levels.size(), shapes.size());
-      framework::VarDesc* reader =
-          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-      reader->SetLoDLevels(lod_levels);
-    }
-  }
-};
-// general infershape for decorated readers
-class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
-                   "Input(UnderlyingReader) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "The output decorated reader should not be null.");
-    ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
-    if (ctx->IsRuntime()) {
-      framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
-          ctx->GetInputVarPtrs("UnderlyingReader")[0]);
-      framework::VarDesc* out_reader =
-          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-      out_reader->SetLoDLevels(in_reader->GetLoDLevels());
-    }
-  }
-};
-// general var type inference for file readers
-class CreateFileReaderInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    std::string reader_name = op_desc.Output("Out")[0];
-    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-    reader->SetType(framework::proto::VarType::READER);
-  }
-};
-// general var type inference for decorated readers
-class CreateDecoratedReaderInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
-    framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
-    std::string out_reader_name = op_desc.Output("Out")[0];
-    framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
-    out_reader->SetType(framework::proto::VarType::READER);
-    out_reader->SetDataTypes(in_reader->GetDataTypes());
-  }
-};
-template <typename T>
-class CreateRandomDataGeneratorOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
-    const auto& ranks = Attr<std::vector<int>>("ranks");
-    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
-    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
-                      "The accumulate of all ranks should be equal to the "
-                      "shape concat's length.");
-    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
-                                                     Attr<float>("max")));
-  }
-};
-class CreateRandomDataGeneratorOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(op_proto, op_checker) {
-    AddOutput("Out", "(ReaderHolder) The created random reader.");
-    AddAttr<std::vector<int>>("shape_concat",
-                              "The concat of all data's shapes.");
-    AddAttr<std::vector<int>>(
-        "ranks",
-        "The ranks of each data."
-        "e.g."
-        "shape_concat = [2,3,4,5,6]"
-        "ranks = [3,2]"
-        "It means the reader will generate two data each time,"
-        "whose shapes are [2,3,4] and [5,6] respectively.");
-    AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
-    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
-    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
-    AddComment(R"DOC(
-      CreateRandomDataGenerator Operator
-      This Op creates a random reader.
-      The reader generates random data instead of really reading from files.
-      Generated data follow an uniform distribution between 'min' and 'max'.
-    )DOC");
-  }
-};
-class CreateShuffleReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
-                                            Attr<int>("buffer_size")));
-  }
-};
-class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(op_proto, op_checker) {
-    AddInput(
-        "UnderlyingReader",
-        "(ReaderHolder) The underlying reader for creating a shuffle reader.");
-    AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
-    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
-    AddComment(R"DOC(
-      CreateShuffleReader Operator
-      A shuffle reader takes another reader as its 'underlying reader'
-      and yields the underlying reader's outputs in a shuffled order.
-    )DOC");
-  }
-};
-class CreateBatchReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new framework::BatchReader(underlying_reader.Get(),
-                                          Attr<int>("batch_size")));
-  }
-};
-class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(op_proto, op_checker) {
-    AddInput(
-        "UnderlyingReader",
-        "(ReaderHolder) The underlying reader for creating a batch reader.");
-    AddOutput("Out", "(ReaderHolder) The created batch reader.");
-    AddAttr<int>("batch_size",
-                 "How many instances the batch reader yields each time.")
-        .GreaterThan(0);
-    AddComment(R"DOC(
-      CreateBatchReader Operator
-      A batch reader takes another reader as its 'underlying reader',
-      gathers the underlying reader's outputs and then yields them in batches.
-    )DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(create_random_data_generator,
-                  ops::CreateRandomDataGeneratorOp<float>,
-                  ops::CreateFileReaderInferShape,
-                  ops::CreateRandomDataGeneratorOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::CreateFileReaderInferVarType);
-REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
-                  ops::CreateDecoratedReaderInferShape,
-                  ops::CreateShuffleReaderOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::CreateDecoratedReaderInferVarType);
-REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
-                  ops::CreateDecoratedReaderInferShape,
-                  ops::CreateBatchReaderOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::CreateDecoratedReaderInferVarType);
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
-grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+if(WITH_DISTRIBUTE)
+  grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+endif()
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -47,11 +47,10 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(det_dims[1], 6UL,
                      "The shape is of Input(DetectRes) [N, 6].");
    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                      "The rank of Input(Label) must be 2, "
                      "the shape is [N, 6].");
-    PADDLE_ENFORCE_EQ(label_dims[1], 6UL,
+    PADDLE_ENFORCE_EQ(label_dims[1], 6, "The shape is of Input(Label) [N, 6].");
-                      "The shape is of Input(Label) [N, 6].");
    if (ctx->HasInput("PosCount")) {
      PADDLE_ENFORCE(ctx->HasInput("TruePos"),
@@ -96,6 +95,10 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
             "instance, the offsets in first dimension are called LoD, "
             "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "
             "means there is no ground-truth data.");
+    // Optional flag input; the description below is user-facing op
+    // documentation.
+    AddInput("HasState",
+             "(Tensor<int>) A tensor with shape [1], 0 means ignoring input "
+             "states, which include PosCount, TruePos, FalsePos.")
+        .AsDispensable();
    AddInput("PosCount",
             "(Tensor) A tensor with shape [Ncls, 1], store the "
             "input positive example count of each class, Ncls is the count of "
@@ -139,13 +142,21 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("MAP",
              "(Tensor) A tensor with shape [1], store the mAP evaluate "
              "result of the detection.");
+    AddAttr<int>("class_num",
+                 "(int) "
+                 "The class number.");
+    // Class index treated as background; defaults to 0. The description
+    // below is user-facing op documentation.
+    AddAttr<int>(
+        "background_label",
+        "(int, default: 0) "
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
+        .SetDefault(0);
    AddAttr<float>(
        "overlap_threshold",
        "(float) "
        "The lower bound jaccard overlap threshold of detection output and "
        "ground-truth data.")
-        .SetDefault(.3f);
+        .SetDefault(.5f);
    AddAttr<bool>("evaluate_difficult",
                  "(bool, default true) "
                  "Switch to control whether the difficult data is evaluated.")

--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -69,6 +69,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    float overlap_threshold = ctx.Attr<float>("overlap_threshold");
    float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
+    int class_num = ctx.Attr<int>("class_num");
    auto label_lod = in_label->lod();
    auto detect_lod = in_detect->lod();
@@ -87,19 +88,27 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    std::map<int, std::vector<std::pair<T, int>>> true_pos;
    std::map<int, std::vector<std::pair<T, int>>> false_pos;
-    if (in_pos_count != nullptr) {
+    auto* has_state = ctx.Input<framework::LoDTensor>("HasState");
+    int state = 0;
+    if (has_state) {
+      state = has_state->data<int>()[0];
+    }
+    if (in_pos_count != nullptr && state) {
      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count,
-                  true_pos, false_pos);
+                  true_pos, false_pos, class_num);
    }
    CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
                             overlap_threshold, label_pos_count, true_pos,
                             false_pos);
-    T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos);
+    int background_label = ctx.Attr<int>("background_label");
+    T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
+                    background_label);
    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count,
-                 *out_true_pos, *out_false_pos);
+                 *out_true_pos, *out_false_pos, class_num);
    T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
    map_data[0] = map;
@@ -184,24 +193,21 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
      const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
      framework::Tensor& output_pos_count,
      framework::LoDTensor& output_true_pos,
-      framework::LoDTensor& output_false_pos) const {
+      framework::LoDTensor& output_false_pos, const int class_num) const {
-    int max_class_id = 0;
    int true_pos_count = 0;
    int false_pos_count = 0;
-    for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
+    for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
-      int label = it->first;
+      auto tp = it->second;
-      if (label > max_class_id) max_class_id = label;
+      true_pos_count += tp.size();
-      int label_num_pos = it->second;
+    }
-      if (label_num_pos == 0 || true_pos.find(label) == true_pos.end())
+    for (auto it = false_pos.begin(); it != false_pos.end(); ++it) {
-        continue;
+      auto fp = it->second;
-      auto label_true_pos = true_pos.find(label)->second;
+      false_pos_count += fp.size();
-      auto label_false_pos = false_pos.find(label)->second;
-      true_pos_count += label_true_pos.size();
-      false_pos_count += label_false_pos.size();
    }
    int* pos_count_data = output_pos_count.mutable_data<int>(
-        framework::make_ddim({max_class_id + 1, 1}), ctx.GetPlace());
+        framework::make_ddim({class_num, 1}), ctx.GetPlace());
    T* true_pos_data = output_true_pos.mutable_data<T>(
        framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
    T* false_pos_data = output_false_pos.mutable_data<T>(
@@ -210,7 +216,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    false_pos_count = 0;
    std::vector<size_t> true_pos_starts = {0};
    std::vector<size_t> false_pos_starts = {0};
-    for (int i = 0; i <= max_class_id; ++i) {
+    for (int i = 0; i < class_num; ++i) {
      auto it_count = label_pos_count.find(i);
      pos_count_data[i] = 0;
      if (it_count != label_pos_count.end()) {
@@ -251,17 +257,16 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    return;
  }
-  void GetInputPos(
+  void GetInputPos(const framework::Tensor& input_pos_count,
-      const framework::Tensor& input_pos_count,
+                   const framework::LoDTensor& input_true_pos,
-      const framework::LoDTensor& input_true_pos,
+                   const framework::LoDTensor& input_false_pos,
-      const framework::LoDTensor& input_false_pos,
+                   std::map<int, int>& label_pos_count,
-      std::map<int, int>& label_pos_count,
+                   std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-      std::map<int, std::vector<std::pair<T, int>>>& true_pos,
+                   std::map<int, std::vector<std::pair<T, int>>>& false_pos,
-      std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+                   const int class_num) const {
    constexpr T kEPS = static_cast<T>(1e-6);
-    int class_number = input_pos_count.dims()[0];
    const int* pos_count_data = input_pos_count.data<int>();
-    for (int i = 0; i < class_number; ++i) {
+    for (int i = 0; i < class_num; ++i) {
      label_pos_count[i] = pos_count_data[i];
    }
@@ -384,17 +389,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    }
  }
-  T CalcMAP(
+  // Accumulates average precision over the classes in `label_pos_count`.
+  // `background_label` is a class index (not a count): the class with that
+  // index is excluded, and -1 excludes no class.
+  T CalcMAP(APType ap_type, const std::map<int, int>& label_pos_count,
-      APType ap_type, const std::map<int, int>& label_pos_count,
+            const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-      const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
+            const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
-      const std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+            const int background_label) const {
    T mAP = 0.0;
    int count = 0;
    for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
      int label = it->first;
      int label_num_pos = it->second;
-      if (label_num_pos == 0 || true_pos.find(label) == true_pos.end())
+      // Skip the background class (compare the class index, not the positive
+      // count), any class without ground-truth positives (as the previous
+      // code did), and any class that has no entry in true_pos.
+      if (label == background_label || label_num_pos == 0 ||
+          true_pos.find(label) == true_pos.end()) {
        continue;
+      }
      auto label_true_pos = true_pos.find(label)->second;
      auto label_false_pos = false_pos.find(label)->second;
      // Compute average precision.
@@ -443,7 +450,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
      }
    }
    if (count != 0) mAP /= count;
-    return mAP * 100;
+    return mAP;
  }
 };  // namespace operators

--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -40,80 +40,14 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
 };
+// Element-wise gradient functor for X, passed to ElemwiseGradCompute by
+// ElementwiseMulGradKernel: returns dout * y (x and out are unused).
 template <typename T>
-struct ElementwiseMulGradFunctor {
+struct IdentityGrad_DX {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
-            typename dY, typename dZ>
-  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-    if (dx) {
-      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-      dx_e.device(d) = dz_e * y_e;
-    }
-    if (dy) {
-      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-      dy_e.device(d) = x_e * dz_e;
-    }
-  }
 };
+// Element-wise gradient functor for Y, passed to ElemwiseGradCompute by
+// ElementwiseMulGradKernel: returns dout * x (y and out are unused).
 template <typename T>
-struct ElementwiseMulBroadCastGradFunctor {
+struct IdentityGrad_DY {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
-            typename dY, typename dZ, typename Pre, typename N>
-  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-    if (dx) {
-      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-      dx_e.device(d) = dz_e * y_e_bcast;
-    }
-    if (dy) {
-      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-      dy_e.device(d) = (x_e * dz_e)
-                           .reshape(Eigen::DSizes<int, 2>(pre, n))
-                           .sum(Eigen::array<int, 1>{{0}});
-    }
-  }
 };
-template <typename T>
-struct ElementwiseMulBroadCast2GradFunctor {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
-            typename dY, typename dZ, typename Pre, typename N, typename Post>
-  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
-                  Post post) {
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-    if (dx) {
-      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-      dx_e.device(d) = dz_e * y_e_bcast;
-    }
-    if (dy) {
-      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-      dy_e.device(d) = (x_e * dz_e)
-                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
-                           .sum(Eigen::array<int, 2>{{0, 2}});
-    }
-  }
-};
 template <typename DeviceContext, typename T>
 class ElementwiseMulGradKernel : public framework::OpKernel<T> {
 public:
@@ -127,12 +61,11 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> {
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
-    ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
+    ElemwiseGradCompute<DeviceContext, T, IdentityGrad_DX<T>,
-                           ElementwiseMulBroadCastGradFunctor<T>,
+                        IdentityGrad_DY<T>>(ctx, *x, *y, *out, *dout, axis, dx,
-                           ElementwiseMulBroadCast2GradFunctor<T>>(
+                                            dy, IdentityGrad_DX<T>(),
-        ctx, x, y, out, dout, axis, dx, dy);
+                                            IdentityGrad_DY<T>());
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast {
      dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
    }
    if (dy_ != nullptr) {
-      dy_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
+      dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
    }
  }

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -44,6 +44,9 @@ math_library(lstm_compute DEPS activation_functions)
 math_library(gru_compute DEPS activation_functions)
 if(WITH_GPU)
    nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
+    nv_library(concat_functor SRCS concat.cc concat.cu DEPS device_context tensor)
+else()
+    cc_library(concat_functor SRCS concat.cc DEPS device_context tensor)
 endif()
 cc_test(math_function_test SRCS math_function_test.cc)
@@ -55,3 +58,4 @@ if(WITH_GPU)
    nv_test(math_function_gpu_test SRCS math_function_test.cu)
    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
 endif()
+cc_test(concat_test SRCS concat_test.cc DEPS concat_functor tensor)
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/math/concat.h"
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * CPU specialization of ConcatFunctor.
+ *
+ * All tensors' dimension should be the same and the values of
+ * each dimension are the same, except the axis dimension.
+ *
+ * Every tensor is viewed as a 2-D matrix of `rows` x `cols`, where `rows` is
+ * the product of the dimensions before `axis` (taken from input[0]) and
+ * `cols` is numel / rows. Row k of the output is then row k of every input,
+ * laid out back to back in input order.
+ *
+ * \param context  CPU device context; only its place is used.
+ * \param input    Tensors to concatenate (input[0] is read unconditionally).
+ * \param axis     Dimension along which to concatenate.
+ * \param output   Destination tensor; read through data<T>(), so it has to be
+ *                 allocated by the caller beforehand.
+ */
+template <typename T>
+class ConcatFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::vector<framework::Tensor>& input, const int axis,
+                  framework::Tensor* output) {
+    // TODO(zcd): Add input data validity checking
+    const int num = static_cast<int>(input.size());
+    // 64-bit arithmetic so that row/column offsets do not overflow `int` on
+    // large tensors (numel() and the dims are 64-bit values).
+    int64_t rows = 1;
+    auto dim_0 = input[0].dims();
+    for (int i = 0; i < axis; ++i) {
+      rows *= dim_0[i];
+    }
+    int64_t out_cols = 0;
+    std::vector<int64_t> input_cols(input.size());
+    for (int i = 0; i < num; ++i) {
+      const int64_t t_cols = input[i].numel() / rows;
+      out_cols += t_cols;
+      input_cols[i] = t_cols;
+    }
+    // Keep a copy of the place instead of a reference: a reference obtained
+    // through boost::get on the value returned by GetPlace() would dangle if
+    // GetPlace() returns a temporary.
+    const platform::CPUPlace cpu_place =
+        boost::get<platform::CPUPlace>(context.GetPlace());
+    T* out_data = output->data<T>();
+    // computation
+    for (int64_t k = 0; k < rows; ++k) {
+      T* dst_ptr = out_data + k * out_cols;
+      int64_t col_idx = 0;
+      for (int j = 0; j < num; ++j) {
+        const int64_t col_len = input_cols[j];
+        const T* src_ptr = input[j].data<T>() + k * col_len;
+        memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_ptr,
+                     sizeof(T) * col_len);
+        col_idx += col_len;
+      }
+    }
+  }
+};
+/*
+ * CPU specialization of ConcatGradFunctor (the inverse of ConcatFunctor).
+ *
+ * All tensors' dimension should be the same and the values of
+ * each dimension are the same, except the axis dimension.
+ *
+ * `input` is viewed as a matrix of `input_rows` x `input_cols`, where
+ * `input_rows` is the product of the dimensions before `axis` (taken from
+ * outputs[0]). Each input row is cut into consecutive slices, one per
+ * output, whose widths are outputs[i].numel() / input_rows.
+ *
+ * \param context  CPU device context; only its place is used.
+ * \param input    Tensor to split.
+ * \param axis     Dimension along which `input` is split.
+ * \param outputs  Destination tensors (outputs[0] is read unconditionally);
+ *                 accessed through data<T>(), so they have to be allocated
+ *                 by the caller beforehand.
+ */
+template <typename T>
+class ConcatGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, const int axis,
+                  std::vector<framework::Tensor>& outputs) {
+    // TODO(zcd): Add input data validity checking
+    const int num = static_cast<int>(outputs.size());
+    // 64-bit arithmetic so that row/column offsets do not overflow `int` on
+    // large tensors (numel() and the dims are 64-bit values).
+    int64_t input_rows = 1;
+    auto dim_0 = outputs[0].dims();
+    for (int i = 0; i < axis; ++i) {
+      input_rows *= dim_0[i];
+    }
+    int64_t input_cols = 0;
+    std::vector<int64_t> output_cols(outputs.size());
+    for (int i = 0; i < num; ++i) {
+      const int64_t t_cols = outputs[i].numel() / input_rows;
+      input_cols += t_cols;
+      output_cols[i] = t_cols;
+    }
+    // Keep a copy of the place instead of a reference: a reference obtained
+    // through boost::get on the value returned by GetPlace() would dangle if
+    // GetPlace() returns a temporary.
+    const platform::CPUPlace cpu_place =
+        boost::get<platform::CPUPlace>(context.GetPlace());
+    const T* in_data = input.data<T>();
+    // computation
+    for (int64_t k = 0; k < input_rows; ++k) {
+      const T* src_ptr = in_data + k * input_cols;
+      int64_t col_idx = 0;
+      for (int j = 0; j < num; ++j) {
+        const int64_t col_len = output_cols[j];
+        T* dst_ptr = outputs[j].data<T>() + k * col_len;
+        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
+                     sizeof(T) * col_len);
+        col_idx += col_len;
+      }
+    }
+  }
+};
+// Explicit instantiations of the CPU functors for the element types listed
+// below; the specializations are defined only in this translation unit.
+template class ConcatFunctor<platform::CPUDeviceContext, int>;
+template class ConcatFunctor<platform::CPUDeviceContext, int64_t>;
+template class ConcatFunctor<platform::CPUDeviceContext, float>;
+template class ConcatFunctor<platform::CPUDeviceContext, double>;
+template class ConcatGradFunctor<platform::CPUDeviceContext, int>;
+template class ConcatGradFunctor<platform::CPUDeviceContext, int64_t>;
+template class ConcatGradFunctor<platform::CPUDeviceContext, float>;
+template class ConcatGradFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+// Device-side binary search over the `count` elements starting at `first`.
+// Returns the index of the first element that compares strictly greater than
+// `val` (or `count` when there is none). The halving search presumes the
+// range is sorted in ascending order.
+template <typename T>
+__device__ T upper_bound(const T* first, T count, T val) {
+  const T* const begin = first;
+  while (count > 0) {
+    const T half = count / 2;
+    const T* mid = first + half;
+    if (val < *mid) {
+      // The answer lies in the lower half, [first, mid).
+      count = half;
+    } else {
+      // *mid <= val: discard the lower half together with mid itself.
+      first = mid + 1;
+      count -= half + 1;
+    }
+  }
+  return first - begin;
+}
+// Concatenation kernel for inputs of differing widths.
+//
+// inputs       device array of pointers, one per input matrix.
+// input_cols   column boundaries: input i occupies output columns
+//              [input_cols[i], input_cols[i + 1]).
+// col_size     number of entries in input_cols.
+// output_rows  number of rows of the output (and of every input).
+// output_cols  number of columns of the output.
+// output       destination matrix, row-major, output_rows x output_cols.
+//
+// Threads stride over output columns in x and over rows in y.
+template <typename T>
+__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
+                             const int output_rows, const int output_cols,
+                             T* output) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  // Locate the input segment that contains this thread's first column.
+  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
+  int curr_offset = input_cols[segment];
+  int curr_segment = segment;
+  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
+    // Column boundaries are ints; keep them in an int rather than in the
+    // element type T, which would round-trip them through float/double.
+    int curr_col_offset;
+    // Advance to the segment containing tid_x after a stride step.
+    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+    }
+    int local_col = tid_x - curr_offset;
+    int segment_width = curr_col_offset - curr_offset;
+    T* input_ptr = inputs[curr_segment];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
+      output[tid_y * output_cols + tid_x] =
+          input_ptr[tid_y * segment_width + local_col];
+  }
+}
+// Concatenation kernel for the case where every input has the same width.
+//
+// inputs       device array of pointers, one per input matrix.
+// input_col    number of columns of each input.
+// output_rows  number of rows of the output (and of every input).
+// output_cols  number of columns of the output.
+// output       destination matrix, row-major, output_rows x output_cols.
+//
+// Threads stride over output columns in x and over rows in y.
+template <typename T>
+__global__ void KernelConcat(T** inputs, const int input_col,
+                             const int output_rows, const int output_cols,
+                             T* output) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
+    // Exact integer division. Multiplying by a precomputed 1.0 / input_col
+    // can land just below the true quotient (e.g. 49 * (1.0 / 49) < 1) and
+    // truncate to the previous input.
+    int split = tid_x / input_col;
+    int in_offset = tid_x - split * input_col;
+    T* input_ptr = inputs[split];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
+      output[tid_y * output_cols + tid_x] =
+          input_ptr[tid_y * input_col + in_offset];
+    }
+  }
+}
+// Split kernel for outputs of differing widths.
+//
+// input        source matrix, row-major, input_row x input_col.
+// input_row    number of rows of the input (and of every output).
+// input_col    number of columns of the input.
+// output_cols  column boundaries: output i receives input columns
+//              [output_cols[i], output_cols[i + 1]).
+// col_size     number of entries in output_cols.
+// outputs      device array of pointers, one per output matrix.
+//
+// Threads stride over input columns in x and over rows in y.
+template <typename T>
+__global__ void KernelConcatGrad(const T* input, const int input_row,
+                                 const int input_col, const int* output_cols,
+                                 int col_size, T** outputs) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  // Locate the output segment that contains this thread's first column.
+  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
+  int curr_offset = output_cols[segment];
+  int curr_segment = segment;
+  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
+    // Column boundaries are ints; keep them in an int rather than in the
+    // element type T, which would round-trip them through float/double.
+    int curr_col_offset;
+    // Advance to the segment containing tid_x after a stride step.
+    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+    }
+    int local_col = tid_x - curr_offset;
+    int segment_width = curr_col_offset - curr_offset;
+    T* output_ptr = outputs[curr_segment];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
+      output_ptr[tid_y * segment_width + local_col] =
+          input[tid_y * input_col + tid_x];
+  }
+}
+// Split kernel for the case where every output has the same width.
+//
+// input        source matrix, row-major, input_row x input_col.
+// input_row    number of rows of the input (and of every output).
+// input_col    number of columns of the input.
+// output_cols  number of columns of each output.
+// outputs      device array of pointers, one per output matrix.
+//
+// Threads stride over input columns in x and over rows in y.
+template <typename T>
+__global__ void KernelConcatGrad(const T* input, const int input_row,
+                                 const int input_col, const int output_cols,
+                                 T** outputs) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
+    // The destination output is selected by the per-output width, not by the
+    // total input width: dividing by input_col would always yield 0 here
+    // (tid_x < input_col) and send every column to outputs[0].
+    int split = tid_x / output_cols;
+    int in_offset = tid_x - split * output_cols;
+    T* output_ptr = outputs[split];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
+      output_ptr[tid_y * output_cols + in_offset] =
+          input[tid_y * input_col + tid_x];
+  }
+}
+/*
+ * CUDA specialization of ConcatFunctor.
+ *
+ * All tensors' dimension should be the same and the values of
+ * each dimension are the same, except the axis dimension.
+ *
+ * Every tensor is viewed as a matrix of `rows` x `cols`, where `rows` is the
+ * product of the dimensions before `axis` (taken from input[0]). The functor
+ * stages the input pointers and cumulative column offsets, picks a launch
+ * configuration and launches one of the two KernelConcat overloads on
+ * context.stream().
+ *
+ * \param context  CUDA device context (place, stream, thread limits).
+ * \param input    Tensors to concatenate (input[0] is read unconditionally).
+ * \param axis     Dimension along which to concatenate.
+ * \param output   Destination tensor; read through data<T>(), so it has to be
+ *                 allocated by the caller beforehand.
+ */
+template <typename T>
+class ConcatFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const std::vector<framework::Tensor>& input, const int axis,
+                  framework::Tensor* output) {
+    // TODO(zcd): Add input data validity checking
+    int num = input.size();
+    int rows = 1;
+    auto dim_0 = input[0].dims();
+    for (int i = 0; i < axis; ++i) {
+      rows *= dim_0[i];
+    }
+    // Width of the first input; used to detect the "all inputs equal" case.
+    int cols = input[0].numel() / rows;
+    int out_rows = rows, out_cols = 0;
+    // Raw storage for `num` pointers, expressed in 2-byte int16_t units
+    // (num * sizeof(T*) bytes in total); it is reinterpreted as T** below.
+    framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
+    // Cumulative column offsets: entry 0 is 0, entry i + 1 is the total
+    // width of inputs 0..i.
+    framework::Vector<int> inputs_cols(num + 1);
+    inputs_cols[0] = 0;
+    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
+    bool sameShape = true;
+    for (int i = 0; i < num; ++i) {
+      int t_cols = input[i].numel() / rows;
+      if (sameShape) {
+        if (t_cols != cols) sameShape = false;
+      }
+      out_cols += t_cols;
+      inputs_cols[i + 1] = out_cols;
+      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
+    }
+    // NOTE(review): presumably these calls make the host-side contents
+    // written above available on the device and return device pointers --
+    // confirm against framework::Vector. They must stay after the loop that
+    // fills both vectors.
+    T** ins_gpu =
+        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
+    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
+    // computation
+    // Choose the block and grid shapes: x covers output columns, y covers
+    // rows, with at most kThreadsPerBlock threads per block.
+    const int kThreadsPerBlock = 1024;
+    int block_cols = kThreadsPerBlock;
+    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((out_cols + 31) >> 5) << 5;
+    }
+    // NOTE(review): block_cols is 0 when out_cols is 0, which would make this
+    // a division by zero -- confirm callers never pass empty inputs.
+    int block_rows = kThreadsPerBlock / block_cols;
+    dim3 block_size = dim3(block_cols, block_rows, 1);
+    // Cap the number of blocks by the context's physical thread count.
+    int max_threads = context.GetMaxPhysicalThreadCount();
+    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+    int grid_cols =
+        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
+    int grid_rows =
+        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
+    dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+    if (sameShape) {
+      // All inputs have `cols` columns: the overload taking a single width.
+      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+          ins_gpu, cols, out_rows, out_cols, output->data<T>());
+    } else {
+      // Widths differ: the overload taking the cumulative offsets array.
+      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
+          out_cols, output->data<T>());
+    }
+  }
+};
+/*
+ * CUDA specialization of ConcatGradFunctor.
+ *
+ * All tensors' dimension should be the same and the values of
+ * each dimension are the same, except the axis dimension.
+ *
+ * `input` is viewed as a matrix of `input_row` x `input_col`, where
+ * `input_row` is the product of the dimensions before `axis` (taken from
+ * outputs[0]). The functor stages the output pointers and cumulative column
+ * offsets, picks a launch configuration and launches one of the two
+ * KernelConcatGrad overloads on context.stream().
+ *
+ * \param context  CUDA device context (place, stream, thread limits).
+ * \param input    Tensor to split.
+ * \param axis     Dimension along which `input` is split.
+ * \param outputs  Destination tensors (outputs[0] is read unconditionally);
+ *                 accessed through data<T>(), so they have to be allocated
+ *                 by the caller beforehand.
+ */
+template <typename T>
+class ConcatGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, const int axis,
+                  std::vector<framework::Tensor>& outputs) {
+    // TODO(zcd): Add input data validity checking
+    int num = outputs.size();
+    int input_row = 1;
+    auto dim_0 = outputs[0].dims();
+    for (int i = 0; i < axis; ++i) {
+      input_row *= dim_0[i];
+    }
+    // Width of the first output; used to detect the "all outputs equal" case.
+    int output_col_0 = outputs[0].numel() / input_row;
+    int input_col = 0;
+    bool sameShape = true;
+    // Raw storage for `num` pointers, expressed in 2-byte int16_t units
+    // (num * sizeof(T*) bytes in total); it is reinterpreted as T** below.
+    framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
+    // Cumulative column offsets: entry 0 is 0, entry i + 1 is the total
+    // width of outputs 0..i.
+    framework::Vector<int> outputs_cols(num + 1);
+    outputs_cols[0] = 0;
+    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
+    for (int i = 0; i < num; ++i) {
+      int t_col = outputs[i].numel() / input_row;
+      if (sameShape) {
+        if (t_col != output_col_0) sameShape = false;
+      }
+      input_col += t_col;
+      outputs_cols[i + 1] = input_col;
+      outputs_ptr[i] = outputs[i].data<T>();
+    }
+    // NOTE(review): presumably these calls make the host-side contents
+    // written above available on the device and return device pointers --
+    // confirm against framework::Vector. They must stay after the loop that
+    // fills both vectors.
+    T** outs_gpu =
+        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
+    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
+    // computation
+    // Choose the block and grid shapes: x covers input columns, y covers
+    // rows, with at most kThreadsPerBlock threads per block.
+    const int kThreadsPerBlock = 1024;
+    int block_cols = kThreadsPerBlock;
+    if (input_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((input_col + 31) >> 5) << 5;
+    }
+    // NOTE(review): block_cols is 0 when input_col is 0, which would make
+    // this a division by zero -- confirm callers never pass empty outputs.
+    int block_rows = kThreadsPerBlock / block_cols;
+    dim3 block_size = dim3(block_cols, block_rows, 1);
+    // Cap the number of blocks by the context's physical thread count.
+    int max_threads = context.GetMaxPhysicalThreadCount();
+    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+    int grid_cols =
+        std::min((input_col + block_cols - 1) / block_cols, max_blocks);
+    int grid_rows =
+        std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
+    dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+    if (sameShape) {
+      // All outputs have output_col_0 columns: the single-width overload.
+      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
+    } else {
+      // Widths differ: the overload taking the cumulative offsets array.
+      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+          input.data<T>(), input_row, input_col, outs_col_gpu,
+          static_cast<int>(outputs_cols.size()), outs_gpu);
+    }
+  }
+};
+// Explicit instantiations of the CUDA functors for the element types listed
+// below; the specializations are defined only in this translation unit.
+template class ConcatFunctor<platform::CUDADeviceContext, int>;
+template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
+template class ConcatFunctor<platform::CUDADeviceContext, float>;
+template class ConcatFunctor<platform::CUDADeviceContext, double>;
+template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
+template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
+template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
+template class ConcatGradFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/tensor.h"
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * \brief Concatenate the input tensors along the dimension axis.
+ *  TODO(zcd): maybe it needs to be more detailed.
+ *  Examples:
+ *     Input[0] = [[1,2],[3,4]]
+ *     Input[1] = [[5,6]]
+ *     axis = 0
+ *
+ *     Output = [[1,2],
+ *               [3,4],
+ *               [5,6]]
+ *
+ * Only the call operator is declared here; the per-device specializations
+ * provide the implementation.
+ *
+ * \param context  Device context the operation runs on.
+ * \param input    Tensors to concatenate, in output order.
+ * \param axis     Dimension along which the tensors are joined.
+ * \param output   Tensor that receives the concatenated data.
+ */
+template <typename DeviceContext, typename T>
+class ConcatFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const std::vector<framework::Tensor>& input, const int axis,
+                  framework::Tensor* output);
+};
+/*
+ * \brief Split the input tensor along the dimension axis into outputs.
+ *  TODO(zcd): maybe it needs to be more detailed.
+ *  Examples:
+ *     Input = [[1,2],
+ *              [3,4],
+ *              [5,6]]
+ *     axis = 0
+ *
+ *     Output[0] = [[1,2],[3,4]]
+ *     Output[1] = [[5,6]]
+ *
+ * Only the call operator is declared here; the per-device specializations
+ * provide the implementation.
+ *
+ * \param context  Device context the operation runs on.
+ * \param input    Tensor to split.
+ * \param axis     Dimension along which the tensor is split.
+ * \param outputs  Tensors that receive the consecutive slices of `input`.
+ */
+template <typename DeviceContext, typename T>
+class ConcatGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const int axis, std::vector<framework::Tensor>& outputs);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/math/concat.h"
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/fluid/framework/tensor_util.h"
+using namespace paddle::framework;
+using namespace paddle::platform;
+// Exercises ConcatFunctor<DeviceContext, int> on `Place` with four cases:
+// concatenation along axis 0, 1 and 2 with inputs of different extents, and
+// along axis 1 with inputs of identical shape.
+//
+// For a GPU place the inputs are filled in CPU staging tensors and copied to
+// the device, and the result is copied back before it is checked; for a CPU
+// place the tensors are filled and checked directly. In both cases a_ptr and
+// b_ptr point at host memory holding the expected input values.
+template <typename DeviceContext, typename Place>
+void testConcat() {
+  Tensor input_a_cpu;
+  Tensor input_b_cpu;
+  Tensor out_cpu;
+  Tensor input_a;
+  Tensor input_b;
+  Tensor out;
+  // Allocated on the heap because `DeviceContext context(Place());` would be
+  // parsed as a function declaration; released at the end of the test.
+  DeviceContext* context = new DeviceContext(Place());
+  /**
+   * case 1:
+   *    inputs:
+   *        axis = 0
+   *        t_a.shape: [2, 3, 4]
+   *        t_b.shape: [3, 3, 4]
+   *    output:
+   *        out.shape: [5, 3, 4]
+   */
+  auto dim_a = make_ddim({2, 3, 4});
+  auto dim_b = make_ddim({3, 3, 4});
+  auto dim_out = make_ddim({5, 3, 4});
+  input_a.mutable_data<int>(dim_a, Place());
+  input_b.mutable_data<int>(dim_b, Place());
+  out.mutable_data<int>(dim_out, Place());
+  if (paddle::platform::is_gpu_place(Place())) {
+    input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
+    input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
+    out_cpu.mutable_data<int>(dim_out, CPUPlace());
+  }
+  int* a_ptr;
+  int* b_ptr;
+  if (paddle::platform::is_gpu_place(Place())) {
+    a_ptr = input_a_cpu.data<int>();
+    b_ptr = input_b_cpu.data<int>();
+  } else {
+    a_ptr = input_a.data<int>();
+    b_ptr = input_b.data<int>();
+  }
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    a_ptr[i] = i;
+  }
+  for (int i = 0; i < 3 * 3 * 4; ++i) {
+    b_ptr[i] = i;
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+  }
+  std::vector<Tensor> input;
+  input.push_back(input_a);
+  input.push_back(input_b);
+  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
+  concat_functor(*context, input, 0, &out);
+  // check the dim of input_a, input_b
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  int* out_ptr;
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    out_ptr = out_cpu.data<int>();
+  } else {
+    out_ptr = out.data<int>();
+  }
+  // check the data: with axis = 0 the output is all of a followed by all of b
+  int cols = 2 * 3 * 4;
+  int idx_a = 0, idx_b = 0;
+  for (int j = 0; j < 5 * 3 * 4; ++j) {
+    if (j >= cols) {
+      PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
+      ++idx_b;
+    } else {
+      PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
+      ++idx_a;
+    }
+  }
+  /**
+   * case 2:
+   *    inputs:
+   *        axis = 1
+   *        t_a.shape: [2, 3, 4]
+   *        t_b.shape: [2, 4, 4]
+   *    output:
+   *        out.shape: [2, 7, 4]
+   */
+  dim_a = make_ddim({2, 3, 4});
+  dim_b = make_ddim({2, 4, 4});
+  dim_out = make_ddim({2, 7, 4});
+  input_a.Resize(dim_a);
+  input_b.Resize(dim_b);
+  out.Resize(dim_out);
+  if (paddle::platform::is_gpu_place(Place())) {
+    input_a_cpu.Resize(dim_a);
+    input_b_cpu.Resize(dim_b);
+    out_cpu.Resize(dim_out);
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    a_ptr = input_a_cpu.data<int>();
+    b_ptr = input_b_cpu.data<int>();
+  } else {
+    a_ptr = input_a.data<int>();
+    b_ptr = input_b.data<int>();
+  }
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    a_ptr[i] = i;
+  }
+  for (int i = 0; i < 2 * 4 * 4; ++i) {
+    b_ptr[i] = i;
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+  }
+  input.clear();
+  input.push_back(input_a);
+  input.push_back(input_b);
+  concat_functor(*context, input, 1, &out);
+  // check the dim of input_a, input_b
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    out_ptr = out_cpu.data<int>();
+  } else {
+    out_ptr = out.data<int>();
+  }
+  // check the data: each of the 2 output rows holds 12 values of a followed
+  // by 16 values of b
+  cols = 3 * 4;
+  idx_a = 0, idx_b = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 28; ++j) {
+      if (j >= cols) {
+        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
+        ++idx_b;
+      } else {
+        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
+        ++idx_a;
+      }
+    }
+  }
+  /**
+   * case 3:
+   *    inputs:
+   *        axis = 2
+   *        t_a.shape: [2, 3, 4]
+   *        t_b.shape: [2, 3, 5]
+   *    output:
+   *        out.shape: [2, 3, 9]
+   */
+  dim_a = make_ddim({2, 3, 4});
+  dim_b = make_ddim({2, 3, 5});
+  dim_out = make_ddim({2, 3, 9});
+  input_a.Resize(dim_a);
+  input_b.Resize(dim_b);
+  out.Resize(dim_out);
+  if (paddle::platform::is_gpu_place(Place())) {
+    input_a_cpu.Resize(dim_a);
+    input_b_cpu.Resize(dim_b);
+    out_cpu.Resize(dim_out);
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    a_ptr = input_a_cpu.data<int>();
+    b_ptr = input_b_cpu.data<int>();
+  } else {
+    a_ptr = input_a.data<int>();
+    b_ptr = input_b.data<int>();
+  }
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    a_ptr[i] = i;
+  }
+  for (int i = 0; i < 2 * 3 * 5; ++i) {
+    b_ptr[i] = i;
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+  }
+  input.clear();
+  input.push_back(input_a);
+  input.push_back(input_b);
+  concat_functor(*context, input, 2, &out);
+  // check the dim of input_a, input_b
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    out_ptr = out_cpu.data<int>();
+  } else {
+    out_ptr = out.data<int>();
+  }
+  // check the data: each of the 6 output rows holds 4 values of a followed
+  // by 5 values of b
+  cols = 4;
+  idx_a = 0, idx_b = 0;
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      if (j >= cols) {
+        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
+        ++idx_b;
+      } else {
+        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
+        ++idx_a;
+      }
+    }
+  }
+  /**
+   * case 4:
+   *    inputs:
+   *        axis = 1
+   *        t_a.shape: [2, 3, 4]
+   *        t_b.shape: [2, 3, 4]
+   *    output:
+   *        out.shape: [2, 6, 4]
+   */
+  dim_a = make_ddim({2, 3, 4});
+  dim_b = make_ddim({2, 3, 4});
+  dim_out = make_ddim({2, 6, 4});
+  input_a.Resize(dim_a);
+  input_b.Resize(dim_b);
+  out.Resize(dim_out);
+  if (paddle::platform::is_gpu_place(Place())) {
+    input_a_cpu.Resize(dim_a);
+    input_b_cpu.Resize(dim_b);
+    out_cpu.Resize(dim_out);
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    a_ptr = input_a_cpu.data<int>();
+    b_ptr = input_b_cpu.data<int>();
+  } else {
+    a_ptr = input_a.data<int>();
+    b_ptr = input_b.data<int>();
+  }
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    a_ptr[i] = i;
+  }
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    b_ptr[i] = i;
+  }
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+  }
+  input.clear();
+  input.push_back(input_a);
+  input.push_back(input_b);
+  concat_functor(*context, input, 1, &out);
+  // check the dim of input_a, input_b
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  if (paddle::platform::is_gpu_place(Place())) {
+    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    out_ptr = out_cpu.data<int>();
+  } else {
+    out_ptr = out.data<int>();
+  }
+  // check the data: each of the 2 output rows holds 12 values of a followed
+  // by 12 values of b
+  cols = 12;
+  idx_a = 0, idx_b = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 24; ++j) {
+      if (j >= cols) {
+        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
+        ++idx_b;
+      } else {
+        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
+        ++idx_a;
+      }
+    }
+  }
+  // Release the device context allocated at the top of the test.
+  delete context;
+}
+// Runs the concat checks on the CPU, and additionally on CUDA when the build
+// defines PADDLE_WITH_CUDA.
+TEST(math, concat) {
+  testConcat<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
+#ifdef PADDLE_WITH_CUDA
+  testConcat<paddle::platform::CUDADeviceContext,
+             paddle::platform::CUDAPlace>();
+#endif
+}
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -245,11 +245,13 @@ template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
-#define DEFINE_CPU_TRANS(RANK)                                          \
+#define DEFINE_CPU_TRANS(RANK)                                             \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
+  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
+                            RANK>;                                         \
-  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>;      \
-  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
 DEFINE_CPU_TRANS(1);

--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -247,7 +247,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };

--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -62,7 +62,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
    return framework::OpKernelType(
        framework::ToDataType(
            ctx.Input<framework::LoDTensor>("Scores")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };
@@ -324,7 +324,7 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
             " Please note, M is equal to the 1st dimension of BBoxes. ");
    AddAttr<int>(
        "background_label",
-        "(int64_t, defalut: 0) "
+        "(int, defalut: 0) "
        "The index of background label, the background label will be ignored. "
        "If set to -1, then all categories will be considered.")
        .SetDefault(0);

--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -16,5 +16,50 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 namespace paddle {
-namespace platform {}  // namespace platform
+namespace platform {
+// File-local state shared by every Communicator instance; all accesses in
+// this file happen while holding comm_mu.
+namespace {
+// TODO(panyx0718): Where to destroy them.
+// NCCL communicators created by the most recent InitAll call.
+std::unique_ptr<std::vector<ncclComm_t>> global_comms;
+// Maps a GPU device id to its index in global_comms.
+std::unique_ptr<std::unordered_map<int, int>> comm_id_map;
+// Set once ncclCommInitAll has completed in InitAll.
+bool inited = false;
+// Number of GPUs passed to the last InitAll; -1 converts to the largest
+// size_t value and serves as the "never initialized" sentinel.
+size_t last_num_gpus = -1;
+// TODO(panyx0718): Need to decide whether Paddle supports parallel
+// runs with different number GPUs. If true, current solution is not enough.
+// Guards all of the variables above.
+std::mutex comm_mu;
+}
+// Returns the index of the communicator registered for `device_id` by the
+// last InitAll call. Lookup goes through unordered_map::at, so an unknown
+// device id raises std::out_of_range.
+int Communicator::GetCommId(int device_id) const {
+  std::lock_guard<std::mutex> lock(comm_mu);
+  const std::unordered_map<int, int>& index_of_device = *comm_id_map;
+  return index_of_device.at(device_id);
+}
+// Creates the process-wide NCCL communicators for `gpus`, replacing any
+// previously created set.
+//
+// The call is a no-op when communicators already exist for the same number
+// of GPUs. Otherwise the old communicators are destroyed, the device-id to
+// index map is rebuilt (gpus[i] -> i) and ncclCommInitAll is invoked.
+//
+// The bookkeeping (`inited`, `last_num_gpus`) is committed only after
+// ncclCommInitAll has succeeded, so a failed initialization is retried by the
+// next call instead of being mistaken for a valid, cached state.
+void Communicator::InitAll(const std::vector<int>& gpus) {
+  std::lock_guard<std::mutex> guard(comm_mu);
+  if (inited && last_num_gpus == gpus.size()) {
+    return;
+  }
+  // Only communicators from a successful initialization are valid handles.
+  if (inited && global_comms) {
+    for (size_t i = 0; i < global_comms->size(); ++i) {
+      // FIXME(dzh) : PADDLE_ENFORCE return void
+      dynload::ncclCommDestroy((*global_comms)[i]);
+    }
+  }
+  // No valid communicators exist until ncclCommInitAll below succeeds.
+  inited = false;
+  global_comms.reset(new std::vector<ncclComm_t>());
+  comm_id_map.reset(new std::unordered_map<int, int>());
+  global_comms->resize(gpus.size());
+  for (size_t i = 0; i < gpus.size(); ++i) {
+    (*comm_id_map)[gpus[i]] = i;
+  }
+  PADDLE_ENFORCE(
+      dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
+  last_num_gpus = gpus.size();
+  inited = true;
+}
+// Returns the communicators created by the last InitAll call. The mutex is
+// held only while the reference is obtained, not while the caller uses it.
+const std::vector<ncclComm_t>& Communicator::comms() const {
+  std::lock_guard<std::mutex> lock(comm_mu);
+  const std::vector<ncclComm_t>& all_comms = *global_comms;
+  return all_comms;
+}
+}  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -29,39 +29,16 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 constexpr int kInvalidGPUId = -1;
 struct Communicator {
-  std::vector<ncclComm_t> comms_;
-  std::unordered_map<int, int> comm_id_map_;
-  bool inited_;
  Communicator() {}
-  int GetCommId(int device_id) const { return comm_id_map_.at(device_id); }
+  int GetCommId(int device_id) const;
-  void InitAll(const std::vector<int>& gpus) {
-    comms_.resize(gpus.size());
-    inited_ = false;
-    for (size_t i = 0; i < gpus.size(); ++i) {
-      comm_id_map_[gpus[i]] = i;
-    }
-    PADDLE_ENFORCE(
-        dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
-    inited_ = true;
-  }
-  ~Communicator() {
+  void InitAll(const std::vector<int>& gpus);
-    if (inited_) {
-      for (size_t i = 0; i < comms_.size(); ++i) {
-        // FIXME(dzh) : PADDLE_ENFORCE return void
-        dynload::ncclCommDestroy(comms_[i]);
-      }
-    }
-  }
-  DISABLE_COPY_AND_ASSIGN(Communicator);
+  const std::vector<ncclComm_t>& comms() const;
 };
 }  // namespace platform

--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -78,7 +78,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
-          comm->comms_[idx], stream));
+          comm->comms().at(idx), stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
      VLOG(1) << "gpu : "
@@ -127,7 +127,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
    std::hash<std::string> hasher;
    for (size_t i = 0; i < ins.size(); ++i) {
      if (root == platform::kInvalidGPUId) {
-        root = hasher(ins_names[i]) % comm->comms_.size();
+        root = hasher(ins_names[i]) % comm->comms().size();
      }
      T* recvbuffer = nullptr;
      if (root == gpu_id) {
@@ -139,7 +139,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE(platform::dynload::ncclReduce(
          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
-          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms_[idx],
+          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms().at(idx),
          stream));
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
@@ -176,7 +176,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
        VLOG(1) << " before ncclBcast";
        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
-            root, comm->comms_[idx], stream));
+            root, comm->comms().at(idx), stream));
        VLOG(1) << " after ncclBcast";
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
@@ -190,7 +190,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
-            NCCLTypeWrapper<T>::type, root, comm->comms_[idx], stream));
+            NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), stream));
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -17,8 +17,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-int PoolOutputSize(int input_size, int filter_size, int padding, int stride) {
+int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
-  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+                   bool ceil_mode) {
+  int output_size;
+  if (!ceil_mode) {
+    output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  } else {
+    output_size =
+        (input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
+  }
  PADDLE_ENFORCE(output_size > 0,
                 "Due to the settings of padding(%d), filter_size(%d) and "
                 "stride(%d), the output size is less than 0, please check "
@@ -38,6 +45,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                 "Pooling intput should be 4-D or 5-D tensor.");
@@ -59,8 +67,8 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
  for (size_t i = 0; i < ksize.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
-        PoolOutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+                                          paddings[i], strides[i], ceil_mode));
  }
  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  ctx->ShareLoD("X", "Out");
@@ -167,6 +175,12 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
+  // ceil_mode: selects ceil instead of floor when computing the output size
+  // (see PoolOutputSize); defaults to false, i.e. floor.
+  AddAttr<bool>(
+      "ceil_mode",
+      "(bool, default false) Whether to use the ceil function to calculate "
+      "output height and width. False is the default. If it is set to False, "
+      "the floor function will be used.")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -187,16 +201,21 @@ Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
-Example:   
+Example:
  Input:
       X shape: $(N, C, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, H_{out}, W_{out})$
-  Where
+  For ceil_mode = false:
-       $$ 
+       $$
       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$
+  For ceil_mode = true:
+       $$
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
+       $$
 )DOC");
 }
@@ -251,6 +270,12 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
+  // ceil_mode: selects ceil instead of floor when computing the output size
+  // (see PoolOutputSize); defaults to false, i.e. floor.
+  AddAttr<bool>(
+      "ceil_mode",
+      "(bool, default false) Whether to use the ceil function to calculate "
+      "output depth, height and width. False is the default. If it is set to "
+      "False, the floor function will be used.")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -267,8 +292,8 @@ The pooling3d operation calculates the output based on
 the input, pooling_type, ksize, strides, and paddings parameters.
 Input(X) and output(Out) are in NCDHW format, where N is batch
 size, C is the number of channels, and D, H and W are the depth, height and
-width of the feature, respectively. Parameters(ksize, strides, paddings) 
+width of the feature, respectively. Parameters(ksize, strides, paddings)
-are three elements. These three elements represent depth, height and 
+are three elements. These three elements represent depth, height and
 width, respectively. The input(X) size and output(Out) size may be different.
 Example:
@@ -276,12 +301,18 @@ Example:
       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  Where
+  For ceil_mode = false:
  $$
       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
  $$
+  For ceil_mode = true:
+  $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
+  $$
 )DOC");
 }

--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -67,6 +67,14 @@ class PriorBoxOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
  }
+ protected:
+  // Picks the kernel from the data type of "Input" and always places it on
+  // platform::CPUPlace.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto* input = ctx.Input<framework::Tensor>("Input");
+    const auto data_type = framework::ToDataType(input->type());
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
 };
 class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {

--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
+# Shared registration helpers used by every reader operator below.
+cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
+# One op_library per reader-creating operator; each depends on the registry.
+op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry)
+op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
+op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry)
+# Export the list of reader operator targets to the parent directory scope.
+set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op PARENT_SCOPE)
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+namespace paddle {
+namespace operators {
+namespace reader {
+// A decorated reader that gathers up to 'batch_size' instances from its
+// underlying reader and yields them as one batch per ReadNext() call.
+class BatchReader : public framework::DecoratedReader {
+ public:
+  // reader: the underlying reader, forwarded to DecoratedReader.
+  // batch_size: the number of instances buffer_ reserves room for.
+  BatchReader(ReaderBase* reader, int batch_size)
+      : DecoratedReader(reader), batch_size_(batch_size) {
+    buffer_.reserve(batch_size_);
+  }
+  // Fills '*out' with the next batch; defined out of line in this file.
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+ private:
+  // Maximum number of instances merged into one batch.
+  int batch_size_;
+  // Instances read for the current batch; each element holds the tensors of
+  // one instance.
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+};
+// Operator that wraps the reader held by Input("UnderlyingReader") in a
+// BatchReader and stores the result in the ReaderHolder of Output("Out").
+class CreateBatchReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* in_var = scope.FindVar(Input("UnderlyingReader"));
+    const auto& underlying_reader = in_var->Get<framework::ReaderHolder>();
+    auto* out_var = scope.FindVar(Output("Out"));
+    auto* out = out_var->GetMutable<framework::ReaderHolder>();
+    // The holder is handed a newly allocated decorator via Reset().
+    auto* batch_reader =
+        new BatchReader(underlying_reader.Get(), Attr<int>("batch_size"));
+    out->Reset(batch_reader);
+  }
+};
+// Proto/attribute maker for the create_batch_reader operator. Inputs and
+// outputs are declared by DecoratedReaderMakerBase; this class only adds the
+// 'batch_size' attribute and the operator comment.
+class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    // batch_size must be strictly positive (enforced by GreaterThan(0)).
+    AddAttr<int>("batch_size",
+                 "How many instances the batch reader yields each time.")
+        .GreaterThan(0);
+    AddComment(R"DOC(
+      CreateBatchReader Operator
+      A batch reader takes another reader as its 'underlying reader',
+      gathers the underlying reader's outputs and then yields them in batches.
+    )DOC");
+  }
+};
+// Reads up to batch_size_ instances from the underlying reader and
+// concatenates them, field by field, along dimension 0 into '*out'.
+//
+// out: receives one LoDTensor per field of an instance. It is left empty
+//      when the underlying reader has no more data.
+//
+// All instances must have the same number of fields; for each field the data
+// type, the trailing dimensions and the number of LoD levels must agree.
+void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  buffer_.clear();
+  buffer_.reserve(batch_size_);
+  for (int i = 0; i < batch_size_; ++i) {
+    if (reader_->HasNext()) {
+      buffer_.push_back(std::vector<framework::LoDTensor>());
+      reader_->ReadNext(&buffer_.back());
+    } else {
+      break;
+    }
+  }
+  // Concat instances
+  out->clear();
+  if (buffer_.empty()) {
+    // if buffer_ is empty, the 'out' will return as an empty vector.
+    return;
+  }
+  // Every instance must provide as many fields as the first one, otherwise
+  // buffer_[i][j] below would index out of range.
+  for (size_t i = 1; i < buffer_.size(); ++i) {
+    PADDLE_ENFORCE_EQ(buffer_[i].size(), buffer_[0].size());
+  }
+  int out_num = buffer_[0].size();
+  out->reserve(out_num);
+  for (int j = 0; j < out_num; ++j) {
+    // Merge shape and check data type
+    std::type_index batch_type = buffer_[0][j].type();
+    framework::DDim batch_shape = buffer_[0][j].dims();
+    for (size_t i = 1; i < buffer_.size(); ++i) {
+      std::type_index ins_type = buffer_[i][j].type();
+      framework::DDim ins_shape = buffer_[i][j].dims();
+      PADDLE_ENFORCE_EQ(batch_type, ins_type);
+      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+                        slice_ddim(ins_shape, 1, ins_shape.size()));
+      PADDLE_ENFORCE_GT(ins_shape[0], 0);
+      batch_shape[0] += ins_shape[0];
+    }
+    framework::LoDTensor out_tensor;
+    out_tensor.Resize(batch_shape);
+    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+    int64_t dst_offset = 0;
+    // Merge lod and data
+    framework::LoD batch_lod;
+    for (size_t i = 0; i < buffer_.size(); ++i) {
+      framework::DDim ins_shape = buffer_[i][j].dims();
+      framework::LoD ins_lod = buffer_[i][j].lod();
+      if (i == 0) {
+        batch_lod = ins_lod;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+          auto& lod_level = batch_lod[level_idx];
+          auto& ins_level = ins_lod[level_idx];
+          if (ins_level.size() <= 1) {
+            // Nothing to append at this level.
+            continue;
+          }
+          PADDLE_ENFORCE(lod_level.size() > 0UL,
+                         "The merged LoD level should not be empty.");
+          // The incoming offsets are relative to the start of that instance,
+          // so all of them are shifted by the end offset the batch had
+          // before this instance. The base is captured once because
+          // lod_level.back() changes with every push_back.
+          const size_t base_offset = lod_level.back();
+          for (size_t k = 1; k < ins_level.size(); ++k) {
+            lod_level.push_back(ins_level[k] + base_offset);
+          }
+        }
+      }
+      // Copy this instance's rows into its slot of the batched tensor.
+      auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
+      dst_offset += ins_shape[0];
+    }
+    out_tensor.set_lod(batch_lod);
+    out->push_back(out_tensor);
+  }
+}
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
+                                   ops::CreateBatchReaderOp,
+                                   ops::CreateBatchReaderOpMaker);
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+namespace paddle {
+namespace operators {
+namespace reader {
+// A file-reader stand-in that produces random tensors instead of reading
+// from files. Values are drawn from a uniform real distribution built from
+// 'min' and 'max' and stored as elements of type T.
+template <typename T>
+class RandomDataGenerator : public framework::FileReader {
+ public:
+  // shapes: one shape per tensor produced by each ReadNext() call.
+  // min, max: bounds handed to the uniform distribution; 'min' must not be
+  //           greater than 'max' (enforced before the distribution is built).
+  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
+                      float max)
+      : FileReader(shapes), min_(min), max_(max) {
+    PADDLE_ENFORCE_LE(
+        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+    // Seed the engine from std::random_device on every construction.
+    unsigned int seed = std::random_device()();
+    engine_.seed(seed);
+    dist_ = std::uniform_real_distribution<float>(min_, max_);
+  }
+  // Replaces the content of '*out' with one freshly generated tensor per
+  // entry of shapes_. Each shape must have rank >= 2.
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    out->clear();
+    out->reserve(shapes_.size());
+    for (const framework::DDim& shape : shapes_) {
+      PADDLE_ENFORCE_GE(
+          shape.size(), 2,
+          "The rank of reader's output data should be 2 at least.(Now it's %d)",
+          shape.size());
+      framework::LoDTensor out_tensor;
+      out_tensor.Resize(shape);
+      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+      int64_t numel = framework::product(shape);
+      for (int64_t i = 0; i < numel; ++i) {
+        data[i] = dist_(engine_);
+      }
+      out->push_back(out_tensor);
+    }
+  }
+  // The generator never runs out of data.
+  bool HasNext() const override { return true; }
+  // Nothing to reset.
+  void ReInit() override { return; }
+ private:
+  float min_;
+  float max_;
+  std::minstd_rand engine_;
+  std::uniform_real_distribution<float> dist_;
+};
+// Operator that creates a RandomDataGenerator<T> from the 'shape_concat',
+// 'ranks', 'min' and 'max' attributes and stores it in the ReaderHolder of
+// Output("Out").
+template <typename T>
+class CreateRandomDataGeneratorOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    // The ranks must partition shape_concat exactly.
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      int(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
+                                          Attr<float>("max")));
+  }
+};
+// Proto/attribute maker for the create_random_data_generator operator. The
+// output and shape attributes are declared by FileReaderMakerBase; this class
+// adds the 'min' and 'max' attributes and the operator comment.
+class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
+ public:
+  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : FileReaderMakerBase(op_proto, op_checker) {
+    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+    AddComment(R"DOC(
+      CreateRandomDataGenerator Operator
+      This Op creates a random reader.
+      The reader generates random data instead of really reading from files.
+      Generated data follow an uniform distribution between 'min' and 'max'.
+    )DOC");
+  }
+};
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators::reader;
+REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
+                              ops::CreateRandomDataGeneratorOp<float>,
+                              ops::CreateRandomDataGeneratorOpMaker);
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+namespace paddle {
+namespace operators {
+namespace reader {
+// A decorated reader that loads up to 'buffer_size' instances from its
+// underlying reader, shuffles them and hands them out one per ReadNext().
+class ShuffleReader : public framework::DecoratedReader {
+ public:
+  // reader: the underlying reader, forwarded to DecoratedReader.
+  // buffer_size: the number of instances buffer_ reserves room for.
+  ShuffleReader(ReaderBase* reader, int buffer_size)
+      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
+    buffer_.reserve(buffer_size);
+  }
+  // Yields the next shuffled instance; defined out of line in this file.
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+ private:
+  // Maximum number of instances loaded per refill.
+  int buffer_size_;
+  // The currently loaded (shuffled) instances.
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+  // Index into buffer_ of the next instance to hand out.
+  size_t iteration_pos_;
+};
+// Hands out the next instance of the shuffled buffer through '*out'. Once
+// the buffer is used up it is refilled with up to buffer_size_ instances from
+// the underlying reader and shuffled again. '*out' is returned empty when no
+// instance could be loaded.
+void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  const bool buffer_used_up = iteration_pos_ >= buffer_.size();
+  if (buffer_used_up) {
+    // Refill the buffer from the underlying reader.
+    buffer_.clear();
+    buffer_.reserve(buffer_size_);
+    for (int loaded = 0; loaded < buffer_size_ && reader_->HasNext();
+         ++loaded) {
+      buffer_.emplace_back();
+      reader_->ReadNext(&buffer_.back());
+    }
+    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
+    // optimized.
+    std::random_shuffle(buffer_.begin(), buffer_.end());
+    iteration_pos_ = 0;
+  }
+  out->clear();
+  if (buffer_.empty()) {
+    // Nothing could be loaded: 'out' stays an empty vector.
+    return;
+  }
+  std::swap(*out, buffer_[iteration_pos_]);
+  ++iteration_pos_;
+}
+// Operator that wraps the reader held by Input("UnderlyingReader") in a
+// ShuffleReader and stores the result in the ReaderHolder of Output("Out").
+class CreateShuffleReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* in_var = scope.FindVar(Input("UnderlyingReader"));
+    const auto& underlying_reader = in_var->Get<framework::ReaderHolder>();
+    auto* out_var = scope.FindVar(Output("Out"));
+    auto* out = out_var->GetMutable<framework::ReaderHolder>();
+    // The holder is handed a newly allocated decorator via Reset().
+    auto* shuffle_reader =
+        new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size"));
+    out->Reset(shuffle_reader);
+  }
+};
+// Proto/attribute maker for the create_shuffle_reader operator. Inputs and
+// outputs are declared by DecoratedReaderMakerBase; this class only adds the
+// 'buffer_size' attribute and the operator comment.
+class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    // buffer_size must be strictly positive (enforced by GreaterThan(0)).
+    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateShuffleReader Operator
+      A shuffle reader takes another reader as its 'underlying reader'
+      and yields the underlying reader's outputs in a shuffled order.
+    )DOC");
+  }
+};
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader,
+                                   ops::CreateShuffleReaderOp,
+                                   ops::CreateShuffleReaderOpMaker);
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "reader_op_registry.h"
+namespace paddle {
+namespace operators {
+namespace reader {
+// Splits the flat 'shape_concat' vector back into individual shapes.
+//
+// shape_concat: the dimensions of all shapes, concatenated.
+// ranks: the rank (number of dimensions) of each shape, in order.
+// Returns one DDim per entry of 'ranks'.
+//
+// Fails with an enforce error instead of reading out of range when a rank is
+// negative or the ranks consume more elements than 'shape_concat' holds.
+std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
+                                           const std::vector<int>& ranks) {
+  std::vector<framework::DDim> res;
+  res.reserve(ranks.size());
+  const int total_len = static_cast<int>(shape_concat.size());
+  int offset = 0;
+  for (int len : ranks) {
+    PADDLE_ENFORCE_GE(len, 0, "The rank of a shape can't be negative.(Got %d)",
+                      len);
+    PADDLE_ENFORCE_LE(offset + len, total_len,
+                      "The ranks consume more elements than 'shape_concat' "
+                      "provides.");
+    auto start_it = shape_concat.begin() + offset;
+    auto end_it = start_it + len;
+    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
+    offset += len;
+  }
+  return res;
+}
+// Declares what every file reader operator has in common: the "Out" reader
+// output and the 'shape_concat', 'ranks' and 'lod_levels' attributes.
+FileReaderMakerBase::FileReaderMakerBase(
+    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
+    framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+  AddOutput("Out", "(ReaderHolder) The created random reader.");
+  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
+  // Adjacent literals are concatenated verbatim, so each piece carries its
+  // own trailing separator to keep the generated description readable.
+  AddAttr<std::vector<int>>(
+      "ranks",
+      "The ranks of each data. "
+      "e.g. "
+      "shape_concat = [2,3,4,5,6], "
+      "ranks = [3,2]. "
+      "It means the reader will generate two data each time, "
+      "whose shapes are [2,3,4] and [5,6] respectively.");
+  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+}
+// Shape inference shared by all file reader operators: restores the output
+// shapes from the 'shape_concat'/'ranks' attributes and, at compile time,
+// records the 'lod_levels' attribute on the output reader's VarDesc.
+void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "The output file reader should not be null.");
+  const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
+  const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+  ctx->SetReaderDims("Out", shapes);
+  // LoD levels are stored on the VarDesc, which GetOutputVarPtrs() only
+  // yields for the compile-time context. At runtime the variant holds a
+  // Variable*, so boost::get<VarDesc*> must not be attempted there.
+  if (!ctx->IsRuntime()) {
+    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
+    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
+                      "The number of 'lod_levels'(%d) doesn't match the number "
+                      "of 'shapes'(%d).",
+                      lod_levels.size(), shapes.size());
+    framework::VarDesc* reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    reader->SetLoDLevels(lod_levels);
+  }
+}
+// Marks the variable named by Output("Out") as a READER variable.
+void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
+                                        framework::BlockDesc* block) const {
+  const std::string out_name = op_desc.Output("Out")[0];
+  auto* out_var = block->FindVarRecursive(out_name);
+  out_var->SetType(framework::proto::VarType::READER);
+}
+// Shape inference shared by all decorated reader operators: the output
+// reader takes over the dims of the underlying reader and, at compile time,
+// its LoD levels as well.
+void DecoratedReaderInferShape::operator()(
+    framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
+                 "Input(UnderlyingReader) should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "The output decorated reader should not be null.");
+  ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
+  // LoD levels are stored on the VarDesc, which Get{Input,Output}VarPtrs()
+  // only yield for the compile-time context. At runtime the variant holds a
+  // Variable*, so boost::get<VarDesc*> must not be attempted there.
+  if (!ctx->IsRuntime()) {
+    framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
+        ctx->GetInputVarPtrs("UnderlyingReader")[0]);
+    framework::VarDesc* out_reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    out_reader->SetLoDLevels(in_reader->GetLoDLevels());
+  }
+}
+// Marks Output("Out") as a READER variable and gives it the data types of
+// the variable named by Input("UnderlyingReader").
+void DecoratedReaderInferVarType::operator()(
+    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
+  const std::string src_name = op_desc.Input("UnderlyingReader")[0];
+  auto* src_var = block->FindVarRecursive(src_name);
+  const std::string dst_name = op_desc.Output("Out")[0];
+  auto* dst_var = block->FindVarRecursive(dst_name);
+  dst_var->SetType(framework::proto::VarType::READER);
+  dst_var->SetDataTypes(src_var->GetDataTypes());
+}
+// Declares what every decorated reader operator has in common: the
+// "UnderlyingReader" input and the "Out" output.
+// NOTE(review): the descriptions below say "batch reader", but this base is
+// also used by the shuffle reader maker -- consider a more general wording.
+DecoratedReaderMakerBase::DecoratedReaderMakerBase(
+    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
+    framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+  AddInput("UnderlyingReader",
+           "(ReaderHolder) The underlying reader for creating a batch reader.");
+  AddOutput("Out", "(ReaderHolder) The created batch reader.");
+}
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+namespace paddle {
+namespace operators {
+namespace reader {
+// Splits 'shape_concat' into one DDim per entry of 'ranks'; each rank gives
+// the number of consecutive elements that form one shape.
+extern std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int>& shape_concat, const std::vector<int>& ranks);
+// Base proto maker for file reader operators; its constructor declares the
+// output and attributes they share.
+class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
+ public:
+  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+};
+// general infershape for file reader
+class FileReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override;
+};
+// general var type inference for file reader
+class FileReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override;
+};
+// general infershape for decorated reader
+class DecoratedReaderInferShape : public framework::InferShapeBase {
+ public:
+  // Infers the output reader's shape information from "UnderlyingReader".
+  void operator()(framework::InferShapeContext* ctx) const override;
+};
+// general var type inference for decorated reader
+class DecoratedReaderInferVarType : public framework::VarTypeInference {
+ public:
+  // Sets the variable type and data types of the output reader variable.
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override;
+};
+// Base proto maker for decorated reader operators; its constructor declares
+// the "UnderlyingReader" input and the "Out" output they share.
+class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
+ public:
+  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+};
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+// Registers a file reader operator: the caller passes the operator and maker
+// classes, and the macro appends the shared shape inference, an empty
+// gradient maker and the shared var type inference.
+#define REGISTER_FILE_READER_OPERATOR(op_name, ...)                  \
+  REGISTER_OPERATOR(op_name, __VA_ARGS__,                            \
+                    paddle::operators::reader::FileReaderInferShape, \
+                    paddle::framework::EmptyGradOpMaker,             \
+                    paddle::operators::reader::FileReaderInferVarType)
+// Registers a decorated reader operator: the caller passes the operator and
+// maker classes, and the macro appends the shared shape inference, an empty
+// gradient maker and the shared var type inference.
+#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...)                  \
+  REGISTER_OPERATOR(op_name, __VA_ARGS__,                                 \
+                    paddle::operators::reader::DecoratedReaderInferShape, \
+                    paddle::framework::EmptyGradOpMaker,                  \
+                    paddle::operators::reader::DecoratedReaderInferVarType)
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -84,6 +84,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::vector<int>>("shape",
                              "(vector<int>) "
                              "Target shape of reshape operator.");
+    AddAttr<bool>("inplace",
+                  "Change the source tensor's shape without copy memory.")
+        .SetDefault(true);
    AddComment(R"DOC(
 Reshape Operator.

--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -26,10 +26,16 @@ class ReshapeKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* out = ctx.Output<framework::Tensor>("Out");
    auto* in = ctx.Input<framework::Tensor>("X");
+    bool inplace = ctx.Attr<bool>("inplace");
    auto out_dims = out->dims();
-    out->mutable_data<T>(ctx.GetPlace());
+    if (!inplace) {
-    framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      out->mutable_data<T>(ctx.GetPlace());
-    out->Resize(out_dims);
+      framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
  }
 };
@@ -40,10 +46,16 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    d_x->mutable_data<T>(ctx.GetPlace());
+    bool inplace = ctx.Attr<bool>("inplace");
    auto in_dims = d_x->dims();
-    framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+    if (!inplace) {
-    d_x->Resize(in_dims);
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
  }
 };
 }  // namespace operators

--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include <vector>
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
  DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };
+// Reports whether the cuDNN code path may be taken for this execution context.
+// All of the following must hold: the op's "use_cudnn" attribute is true, the
+// context's place is a GPU place, and (only when built with PADDLE_WITH_CUDA)
+// the CUDA device context exposes a non-null cuDNN handle.
+inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
+  const bool requested = ctx.Attr<bool>("use_cudnn");
+  const bool on_gpu = paddle::platform::is_gpu_place(ctx.GetPlace());
+  if (!(requested && on_gpu)) {
+    return false;
+  }
+#ifdef PADDLE_WITH_CUDA
+  auto& cuda_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  return cuda_ctx.cudnn_handle() != nullptr;
+#else
+  return true;
+#endif
+}
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
  PADDLE_ENFORCE_GT(places.size(), 0);
  for (size_t i = 0; i < places.size(); i++) {
    if (platform::is_cpu_place(places[i])) {
+#ifdef PADDLE_WITH_MKLDNN
+      device_contexts_.emplace(places[i],
+                               new platform::MKLDNNDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+#else
      device_contexts_.emplace(places[i],
                               new platform::CPUDeviceContext(
                                   boost::get<platform::CPUPlace>(places[i])));
+#endif
    } else if (platform::is_gpu_place(places[i])) {
 #ifdef PADDLE_WITH_CUDA
      device_contexts_.emplace(places[i],
@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  SetDeviceId(place_.device);
+  multi_process = GetCUDAMultiProcessors(place_.device);
+  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&stream_, place);
@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
  PADDLE_ENFORCE(cudaGetLastError());
 }
+// Returns the product of the stored multiprocessor count and the stored
+// maximum number of threads per multiprocessor for this context's device.
+int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
+  const int threads_per_mp = max_threads_per_mp;
+  return multi_process * threads_per_mp;
+}
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
  return eigen_device_.get();
 }
@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), ready_(false) {
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
 }
-template <typename T>
+void MKLDNNDeviceContext::SetBlob(const std::string& name,
-void MKLDNNDeviceContext::AddElement(const std::string& op_key,
+                                  std::shared_ptr<void> data) const {
-                                     const T& value) {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  if (GetElement<T>(op_key)) {
+  p = p_blobs_.get();
-    return;
-  }
-  GetElementPool<T>().emplace(op_key, std::move(value));
-}
-template <typename T>
+  auto it = p->find(name);
-const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
-  auto it = GetElementPool<T>().find(op_key);
-  return it == GetElementPool<T>().end() ? nullptr : it->second;
-}
-template <>
+  if (it == p->end()) {
-const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
+    (*p)[name] = data;  // create new blob
-                         std::hash<std::string>>&
+  } else {
-MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
+    it->second = data;  // set data to existing blob
-  return memory_pool_;
+  }
-}
-template <>
+  return;
-const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
-  return primitive_pool_;
 }
-template <>
+std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
-const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
+    const std::string& name) const {
-                         std::hash<std::string>>&
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
+  p = p_blobs_.get();
-  return primitive_desc_pool_;
-}
-void MKLDNNDeviceContext::Execute(bool block) {
+  auto it = p->find(name);
-  if (pipeline_.empty()) {
-    return;
-  }
-  ResetStream();
-  stream_->submit(pipeline_).wait(block);
-  ready_ = false;
-  pipeline_.clear();
-}
-void MKLDNNDeviceContext::ResetStream() {
+  if (it != p->end()) {
-  if (ready_) {
+    return it->second;
-    return;
  }
-  // TODO(TJ): change me when mkldnn have specific method to reset this state
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+  return nullptr;
-  ready_ = true;
 }
 #endif

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #endif
 #ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include <mkldnn.hpp>
 #endif
 #include "paddle/fluid/platform/enforce.h"
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return place in the device context. */
  Place GetPlace() const override;
+  /*! \brief  Return the max physical thread count in the device context */
+  int GetMaxPhysicalThreadCount() const;
  /*! \brief  Return eigen device in the device context. */
  Eigen::GpuDevice* eigen_device() const;
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
  cudaStream_t stream_;
  cudnnHandle_t cudnn_handle_;
  cublasHandle_t cublas_handle_;
+  int multi_process;
+  int max_threads_per_mp;
 };
 template <>
@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 public:
  explicit MKLDNNDeviceContext(CPUPlace place);
-  /* \brief  Add new element: memory, primitive or primitive desc */
-  template <typename T>
-  void AddElement(const std::string& op_key, const T& value);
-  /* \brief  Get existed element: memory, primitive or primitive desc */
-  template <typename T>
-  const T& GetElement(const std::string& op_key) const;
-  /* \brief  Get element pool: memory, primitive or primitive desc pool */
-  template <typename T>
-  const std::unordered_map<const std::string, const T, std::hash<std::string>>&
-  GetElementPool() const;
  /* \brief  Get the active engine */
-  const MKLDNNEngine& engine() const { return *engine_; }
+  const mkldnn::engine& GetEngine() const { return engine_; }
-  /* \brief  Submit primitive to pipeline */
-  void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
-  /*! \brief  Execute all submitted primitives in pipeline */
+  // Set data to blob (i.e. name/data pair). Create blob if not existing
-  void Execute(bool block = true);
+  void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
- protected:
+  // Find a saved blob. Return nullptr if not found
-  /*! \brief  Reset the stream to prepare next exectue */
+  std::shared_ptr<void> GetBlob(const std::string& name) const;
-  void ResetStream();
 private:
-  std::unordered_map<const std::string, const MKLDNNMemoryPtr,
+  mkldnn::engine engine_;
-                     std::hash<std::string>>
+  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
-      memory_pool_;
+      p_blobs_;
-  std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                     std::hash<std::string>>
-      primitive_pool_;
-  std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                     std::hash<std::string>>
-      primitive_desc_pool_;
-  std::vector<MKLDNNPrimitive> pipeline_;
-  MKLDNNStreamPtr stream_;
-  MKLDNNEnginePtr engine_;
-  bool ready_;
 };
 #endif

--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <map>
 #include <mutex>
 #include <numeric>
+#include <thread>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/string/printf.h"
@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
    }                                                                      \
  } while (0)
+// Maps a CUPTI memcpy kind to the event name used in the profile.
+// Any kind without an explicit case below is reported as plain "MEMCPY".
+std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
+  const char *label = "MEMCPY";
+  switch (kind) {
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
+      label = "MEMCPY_HtoD";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
+      label = "MEMCPY_DtoH";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
+      label = "MEMCPY_HtoA";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
+      label = "MEMCPY_AtoH";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
+      label = "MEMCPY_AtoA";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
+      label = "MEMCPY_AtoD";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
+      label = "MEMCPY_DtoA";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
+      label = "MEMCPY_DtoD";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
+      label = "MEMCPY_HtoH";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
+      label = "MEMCPY_PtoP";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
+      label = "MEMCPY_FORCE_INT";
+      break;
+    default:
+      break;
+  }
+  return label;
+}
 void EnableActivity() {
  // Device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call.
@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                                     kernel->correlationId);
            break;
          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
          default: { break; }
        }
      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -140,6 +191,26 @@ class DeviceTracerImpl : public DeviceTracer {
    correlations_[id] = anno;
  }
+  // Appends one CPU-side span [start_ns, end_ns] named `anno` to cpu_records_,
+  // tagged with a hash of the calling thread's id. A null `anno` is ignored.
+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
+    // TODO(panyx0718): Currently, it doesn't support nested situation
+    // Up-level can be cleared by low-level and therefore get nullptr
+    // here.
+    if (anno == nullptr) return;
+    const uint64_t tid =
+        std::hash<std::thread::id>{}(std::this_thread::get_id());
+    std::lock_guard<std::mutex> guard(trace_mu_);
+    cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, tid});
+  }
+  // Appends one memcpy activity to mem_records_.
+  //   name:           label for the copy (e.g. the result of MemcpyKind()).
+  //   start_ns/end_ns: begin and end timestamps of the copy.
+  //   device_id, stream_id, correlation_id: ids taken from the activity record.
+  //   bytes:          number of bytes copied.
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {
+    // Take trace_mu_ like the sibling Add*Records methods do: this is invoked
+    // from the CUPTI buffer-completed callback while other threads may be
+    // appending records.
+    std::lock_guard<std::mutex> l(trace_mu_);
+    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
+                                     stream_id, correlation_id, bytes});
+  }
  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                        uint32_t stream_id, uint32_t correlation_id) {
    std::lock_guard<std::mutex> l(trace_mu_);
@@ -175,7 +246,6 @@ class DeviceTracerImpl : public DeviceTracer {
    CUPTI_CALL(
        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
    enabled_ = true;
  }
@@ -185,7 +255,6 @@ class DeviceTracerImpl : public DeviceTracer {
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
-    std::map<std::string, std::vector<uint64_t>> event_times;
    for (const KernelRecord &r : kernel_records_) {
      if (correlations_.find(r.correlation_id) == correlations_.end()) {
        fprintf(stderr, "cannot relate a kernel activity\n");
@@ -197,7 +266,24 @@ class DeviceTracerImpl : public DeviceTracer {
      event->set_end_ns(r.end_ns);
      event->set_stream_id(r.stream_id);
      event->set_device_id(r.device_id);
-      event_times[event->name()].push_back(r.end_ns - r.start_ns);
+    }
+    for (const CPURecord &r : cpu_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.thread_id);
+      event->set_device_id(-1);
+    }
+    for (const MemRecord &r : mem_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event->mutable_memcopy()->set_bytes(r.bytes);
    }
    std::string profile_str;
    google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
@@ -242,6 +328,8 @@ class DeviceTracerImpl : public DeviceTracer {
  uint64_t start_ns_;
  uint64_t end_ns_;
  std::vector<KernelRecord> kernel_records_;
+  std::vector<MemRecord> mem_records_;
+  std::vector<CPURecord> cpu_records_;
  std::unordered_map<uint32_t, std::string> correlations_;
  CUpti_SubscriberHandle subscriber_;
 };
@@ -254,6 +342,12 @@ class DeviceTracerDummy : public DeviceTracer {
  void AddAnnotation(uint64_t id, const std::string &anno) {}
+  // No-op: the dummy tracer discards CPU records.
+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
+  // No-op: the dummy tracer discards memcpy records.
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {}
  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                        uint32_t stream_id, uint32_t correlation_id) {}
@@ -285,5 +379,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
 void ClearCurAnnotation() { cur_annotation = nullptr; }
+// Returns the annotation last stored by SetCurAnnotation(), or nullptr after
+// ClearCurAnnotation() has reset it.
+const char *CurAnnotation() { return cur_annotation; }
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -36,6 +36,21 @@ class DeviceTracer {
    uint32_t stream_id;
    uint32_t correlation_id;
  };
+  // One CPU-side timed span, as passed to AddCPURecords().
+  struct CPURecord {
+    // Annotation (operation name) active when the span was recorded.
+    std::string name;
+    // Span start and end timestamps, in nanoseconds.
+    uint64_t start_ns;
+    uint64_t end_ns;
+    // Identifier of the recording thread (the tracer implementation stores a
+    // hash of std::thread::id here).
+    uint64_t thread_id;
+  };
+  // One device memory-copy activity, as passed to AddMemRecords().
+  struct MemRecord {
+    // Label describing the copy.
+    std::string name;
+    // Copy start and end timestamps, in nanoseconds.
+    uint64_t start_ns;
+    uint64_t end_ns;
+    // Device and stream the copy was attributed to.
+    uint32_t device_id;
+    uint32_t stream_id;
+    // Id reported alongside the activity; presumably usable to relate the copy
+    // to an annotation, as with kernel records -- TODO confirm.
+    uint32_t correlation_id;
+    // Number of bytes copied.
+    uint64_t bytes;
+  };
  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
@@ -48,6 +63,14 @@ class DeviceTracer {
  // human-readable annotations.
  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
+  // Add a memory-copy record: `name` labels the copy, [start_ns, end_ns] is
+  // its time span, and `bytes` is the amount of data copied.
+  virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
+                             uint64_t end_ns, uint32_t device_id,
+                             uint32_t stream_id, uint32_t correlation_id,
+                             uint64_t bytes) = 0;
+  // Add a CPU-side record named `anno` covering [start_ns, end_ns].
+  virtual void AddCPURecords(const char* anno, uint64_t start_ns,
+                             uint64_t end_ns) = 0;
  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
 void SetCurAnnotation(const char* anno);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
+// Current name of the operation being run in the thread.
+const char* CurAnnotation();
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
  __macro(cuptiFinalize);                     \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
-  __macro(cuptiEnableCallback);
+  __macro(cuptiEnableCallback);               \
+  __macro(cuptiEnableDomain);
 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);

--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -20,10 +20,6 @@ limitations under the License. */
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
-#include "unsupported/Eigen/CXX11/Tensor"
-#include "paddle/fluid/platform/hostdevice.h"
 #ifdef __GNUC__
 #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
 #else
@@ -64,6 +60,18 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
+// Forward declare float16 for eigen.h
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/hostdevice.h"
+namespace paddle {
+namespace platform {
 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 }
 #endif
+// Returns true when `a` is NaN.
+// In CUDA device code with PADDLE_CUDA_FP16 and compute capability >= 5.3 the
+// __hisnan intrinsic is used; everywhere else the raw bits are inspected:
+// with the top bit masked off, any value above 0x7c00 is treated as NaN.
+// NOTE(review): the bit test presumes a.x holds IEEE binary16 bits (1 sign,
+// 5 exponent, 10 mantissa) -- confirm against the float16 definition.
+// The name is parenthesized so that a function-like isnan macro cannot expand.
+HOSTDEVICE inline bool(isnan)(const float16& a) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(half(a));
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+// Returns true when `a` is positive or negative infinity: with the top bit
+// masked off, the raw bits equal exactly 0x7c00.
+HOSTDEVICE inline bool(isinf)(const float16& a) {
+  const auto magnitude_bits = a.x & 0x7fff;
+  return magnitude_bits == 0x7c00;
+}
+// Returns true when `a` is neither NaN nor infinite.
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
+  return !((isnan)(a) || (isinf)(a));
+}
 }  // namespace platform
 }  // namespace paddle
@@ -750,3 +774,27 @@ struct is_pod<paddle::platform::float16> {
 };
 }  // namespace std
+// Specializations of Eigen's numext classification helpers for float16.
+// Each one simply forwards to the paddle::platform function of the same name.
+namespace Eigen {
+namespace numext {
+// NaN test for float16.
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isnan)(a);
+}
+// Infinity test for float16.
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isinf)(a);
+}
+// Finiteness test for float16.
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+}  // namespace numext
+}  // namespace Eigen
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
  return count;
 }
+// Returns the cudaDevAttrMultiProcessorCount attribute of GPU `id`.
+// Enforces that `id` is below the device count and that the CUDA query
+// succeeds.
+int GetCUDAMultiProcessors(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int num_multiprocessors;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&num_multiprocessors,
+                             cudaDevAttrMultiProcessorCount, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMultiProcessors");
+  return num_multiprocessors;
+}
+// Returns the cudaDevAttrMaxThreadsPerMultiProcessor attribute of GPU `id`.
+// Enforces that `id` is below the device count and that the CUDA query
+// succeeds.
+int GetCUDAMaxThreadsPerMultiProcessor(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int max_threads;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&max_threads,
+                             cudaDevAttrMaxThreadsPerMultiProcessor, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
+  return max_threads;
+}
 int GetCurrentDeviceId() {
  int device_id;
  PADDLE_ENFORCE(

--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
+//! Get the MultiProcessors of the ith GPU.
+int GetCUDAMultiProcessors(int i);
+//! Get the MaxThreads of each MultiProcessor of the ith GPU.
+int GetCUDAMaxThreadsPerMultiProcessor(int i);
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();

--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -16,12 +16,15 @@ limitations under the License. */
 #include <mkldnn.hpp>
+#include "paddle/fluid/framework/operator.h"
 namespace paddle {
 namespace platform {
 using MKLDNNStream = mkldnn::stream;
 using MKLDNNEngine = mkldnn::engine;
 using MKLDNNMemory = mkldnn::memory;
+using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
 using MKLDNNPrimitive = mkldnn::primitive;
 using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
 typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
 typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
+// Builds an mkldnn memory descriptor from a dimension vector, an element data
+// type and a memory format.
+inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
+                                          mkldnn::memory::data_type data_type,
+                                          mkldnn::memory::format format) {
+  // Copy the dimensions into mkldnn's own dims type before constructing.
+  mkldnn::memory::dims tz = dims;
+  return mkldnn::memory::desc({tz}, data_type, format);
+}
+// Reports whether the MKLDNN code path may be taken: the op's "use_mkldnn"
+// attribute must be true and the execution place must be a CPU place.
+inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
+  if (!ctx.Attr<bool>("use_mkldnn")) {
+    return false;
+  }
+  return platform::is_cpu_place(ctx.GetPlace());
+}
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
+#include <sys/time.h>
+#include <time.h>
 #include <iomanip>
 #include <map>
 #ifdef PADDLE_WITH_CUDA
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
      .count();
 }
+// Returns the current gettimeofday() wall-clock time expressed in nanoseconds.
+// The underlying clock has microsecond resolution, so the result is always a
+// multiple of 1000.
+inline uint64_t PosixInNsec() {
+  struct timeval now;
+  gettimeofday(&now, nullptr);
+  const uint64_t micros =
+      static_cast<uint64_t>(now.tv_sec) * 1000000 + now.tv_usec;
+  return micros * 1000;
+}
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             const DeviceContext* dev_ctx)
    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
-RecordEvent::RecordEvent(const std::string& name,
+RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
-                         const DeviceContext* dev_ctx) {
+    : start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  name_ = name;
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
 RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled) return;
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (tracer) {
+    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
+  }
  ClearCurAnnotation();
  PopEvent(name_, dev_ctx_);
 }
@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
  Mark("_stop_profiler_", nullptr);
  g_state = ProfilerState::kDisabled;
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
  DeviceTracer* tracer = GetDeviceTracer();
  if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
    tracer->Disable();
    tracer->GenProfile(profile_path);
  }
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, sorted_key);
-  ResetProfiler();
 }
 void ParseEvents(std::vector<std::vector<Event>>& events,

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -108,6 +108,7 @@ struct RecordEvent {
  ~RecordEvent();
+  uint64_t start_ns_;
  // The device context is used by Event to get the current cuda stream.
  const DeviceContext* dev_ctx_;
  // Event name

--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -15,12 +15,17 @@ limitations under the License. */
 syntax = "proto2";
 package paddle.platform.proto;
+message MemCopy { optional uint64 bytes = 3; }
 message Event {
  optional string name = 1;
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
-  optional uint32 device_id = 5;
+  // When non-negative, it represents the GPU id. When -1, it represents the CPU.
+  optional int64 device_id = 5;
  optional uint32 stream_id = 6;
+  optional MemCopy memcopy = 7;
 }
 message Profile {

--- a/paddle/fluid/recordio/CMakeLists.txt
+++ b/paddle/fluid/recordio/CMakeLists.txt
+# internal library.
+cc_library(header SRCS header.cc)
+cc_test(header_test SRCS header_test.cc DEPS header)
+cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
+cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
+cc_library(recordio DEPS chunk header)
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/recordio/chunk.h"
+#include <memory>
+#include <sstream>
+#include "paddle/fluid/platform/enforce.h"
+#include "snappystream.hpp"
+#include "zlib.h"
+namespace paddle {
+namespace recordio {
+// Size of the stack buffer used when streaming data through a callback.
+constexpr size_t kMaxBufSize = 1024;
+// Feeds the contents of `in` to `callback(buf, len)` in pieces of at most
+// kMaxBufSize bytes.
+//   limit > 0  : at most `limit` bytes are consumed.
+//   limit <= 0 : the stream is consumed until it has no more data.
+// Uses read()/gcount() rather than readsome(): readsome() only returns what is
+// already sitting in the stream buffer, which is implementation-defined and
+// may be nothing at all for file streams. If end-of-stream is reached, the
+// eof/fail bits set by read() are cleared again so that callers can still
+// seek on the stream afterwards.
+template <typename Callback>
+static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) {
+  char buf[kMaxBufSize];
+  const bool limited = limit > 0;
+  size_t remaining = limited ? static_cast<size_t>(limit) : 0;
+  while (!limited || remaining > 0) {
+    const size_t want =
+        limited ? std::min(remaining, kMaxBufSize) : kMaxBufSize;
+    in.read(buf, static_cast<std::streamsize>(want));
+    const std::streamsize got = in.gcount();
+    if (got <= 0) {
+      break;
+    }
+    callback(buf, got);
+    if (limited) {
+      remaining -= static_cast<size_t>(got);
+    }
+    if (static_cast<size_t>(got) < want) {
+      break;  // short read: the stream has no more data
+    }
+  }
+  if (in.eof()) {
+    in.clear(in.rdstate() &
+             ~(std::ios_base::eofbit | std::ios_base::failbit));
+  }
+}
+// Copies everything that can still be read from `in` into `os`.
+static void PipeStream(std::istream& in, std::ostream& os) {
+  auto forward = [&os](const char* data, size_t n) { os.write(data, n); };
+  ReadStreamByBuf(in, -1, forward);
+}
+// Computes the zlib CRC-32 of the data read from `in`. `limit` is passed
+// through to ReadStreamByBuf and bounds how many bytes are consumed.
+static uint32_t Crc32Stream(std::istream& in, int limit = -1) {
+  auto checksum = crc32(0, nullptr, 0);
+  auto update = [&checksum](const char* data, size_t n) {
+    checksum = crc32(checksum, reinterpret_cast<const Bytef*>(data), n);
+  };
+  ReadStreamByBuf(in, limit, update);
+  return checksum;
+}
+// Serializes the chunk to `os` as a Header followed by the record payload.
+// Each record is stored as a 32-bit length followed by its bytes; the payload
+// is optionally compressed according to `ct`.
+// Returns false (writing nothing) when the chunk holds no records, true
+// otherwise. Throws via PADDLE_THROW for an unsupported compressor.
+bool Chunk::Write(std::ostream& os, Compressor ct) const {
+  // NOTE(dzhwinter): don't check records.numBytes instead, because
+  // empty records are allowed.
+  if (records_.empty()) {
+    return false;
+  }
+  std::stringstream sout;
+  std::unique_ptr<std::ostream> compressed_stream;
+  switch (ct) {
+    case Compressor::kNoCompress:
+      break;
+    case Compressor::kSnappy:
+      compressed_stream.reset(new snappy::oSnappyStream(sout));
+      break;
+    default:
+      PADDLE_THROW("Not implemented");
+  }
+  std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout;
+  for (auto& record : records_) {
+    // Write the length from a real 32-bit variable; copying the first four
+    // bytes of a size_t is only correct on little-endian hosts.
+    const uint32_t sz = static_cast<uint32_t>(record.size());
+    buf_stream.write(reinterpret_cast<const char*>(&sz), sizeof(sz))
+        .write(record.data(), record.size());
+  }
+  // Destroy the compressor (if any) before measuring; presumably this is what
+  // flushes its pending output into `sout`.
+  compressed_stream.reset();
+  // The payload size is the *write* position of `sout`. The read position
+  // (tellg) is still 0 here and would record a compressed size of 0.
+  const uint32_t len = static_cast<uint32_t>(sout.tellp());
+  sout.seekg(0, std::ios::beg);
+  uint32_t crc = Crc32Stream(sout);
+  sout.seekg(0, std::ios::beg);
+  Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
+  hdr.Write(os);
+  PipeStream(sout, os);
+  return true;
+}
+// Replaces the chunk's contents with the chunk read from `sin`.
+// Reads the Header, verifies the payload checksum, then decodes
+// hdr.NumRecords() records, each stored as a 32-bit length followed by that
+// many bytes. Enforces that the stored and computed checksums match and
+// throws via PADDLE_THROW for an unsupported compressor.
+// NOTE(review): the per-record reads are not checked for stream failure, so a
+// truncated payload is not detected here -- confirm whether callers rely on
+// the checksum alone.
+void Chunk::Parse(std::istream& sin) {
+  Header hdr;
+  hdr.Parse(sin);
+  // Remember where the payload starts so it can be re-read after checksumming.
+  auto beg_pos = sin.tellg();
+  auto crc = Crc32Stream(sin, hdr.CompressSize());
+  PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
+  Clear();
+  // Rewind to the start of the payload for the actual decode pass.
+  sin.seekg(beg_pos, std::ios::beg);
+  std::unique_ptr<std::istream> compressed_stream;
+  switch (hdr.CompressType()) {
+    case Compressor::kNoCompress:
+      break;
+    case Compressor::kSnappy:
+      compressed_stream.reset(new snappy::iSnappyStream(sin));
+      break;
+    default:
+      PADDLE_THROW("Not implemented");
+  }
+  // Read through the decompressor when one was created, else directly.
+  std::istream& stream = compressed_stream ? *compressed_stream : sin;
+  for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
+    uint32_t rec_len;
+    stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
+    std::string buf;
+    buf.resize(rec_len);
+    stream.read(&buf[0], rec_len);
+    Add(buf);
+  }
+}
+}  // namespace recordio
+}  // namespace paddle
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/recordio/header.h"
+namespace paddle {
+namespace recordio {
+// A Chunk contains the Header and optionally compressed records.
+// An in-memory batch of records that can be serialized to, and parsed from, a
+// stream as a Header plus an optionally compressed payload.
+class Chunk {
+public:
+  Chunk() : num_bytes_(0) {}
+  // Appends a record and adds its length to the running byte count.
+  void Add(std::string buf) {
+    records_.push_back(buf);
+    num_bytes_ += buf.size();
+  }
+  // Serializes the chunk to `fo` using compressor `ct`. Returns false without
+  // writing anything when the chunk holds no records. The chunk itself is not
+  // modified; call Clear() to make it ready for the next batch of Add() calls.
+  bool Write(std::ostream& fo, Compressor ct) const;
+  // Drops all records and resets the byte count.
+  void Clear() {
+    records_.clear();
+    num_bytes_ = 0;
+  }
+  // Replaces the chunk's contents with the chunk read from `sin`.
+  void Parse(std::istream& sin);
+  // Sum of the lengths of all records currently held, in bytes.
+  size_t NumBytes() const { return num_bytes_; }
+  // Returns the i-th record; `i` is not range-checked.
+  const std::string& Record(int i) const { return records_[i]; }
+private:
+  std::vector<std::string> records_;
+  // sum of record lengths in bytes.
+  size_t num_bytes_;
+  DISABLE_COPY_AND_ASSIGN(Chunk);
+};
+size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
+void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
+}  // namespace recordio
+}  // namespace paddle
--- a/paddle/fluid/recordio/chunk_test.cc
+++ b/paddle/fluid/recordio/chunk_test.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/recordio/chunk.h"
+#include <sstream>
+#include "gtest/gtest.h"
+using namespace paddle::recordio;
+// Round-trips two records through an uncompressed chunk and checks that both
+// the byte count and the record contents survive.
+TEST(Chunk, SaveLoad) {
+  Chunk ch;
+  // The explicit lengths include the literal's trailing '\0'.
+  ch.Add(std::string("12345", 6));
+  ch.Add(std::string("123", 4));
+  std::stringstream ss;
+  ASSERT_TRUE(ch.Write(ss, Compressor::kNoCompress));
+  ch.Clear();
+  ch.Parse(ss);
+  ASSERT_EQ(ch.NumBytes(), 10U);
+  ASSERT_EQ(ch.Record(0), std::string("12345", 6));
+  ASSERT_EQ(ch.Record(1), std::string("123", 4));
+}
+// Writes the same records with and without Snappy, checks that the
+// compressed stream is not longer than the raw one, then parses the
+// compressed stream back and checks the restored byte count.
+TEST(Chunk, Compressor) {
+  Chunk ch;
+  // 6 + 4 + 4 + 4 = 18 bytes (each length includes the trailing NUL).
+  ch.Add(std::string("12345", 6));
+  ch.Add(std::string("123", 4));
+  ch.Add(std::string("123", 4));
+  ch.Add(std::string("123", 4));
+  std::stringstream ss;
+  ch.Write(ss, Compressor::kSnappy);
+  std::stringstream ss2;
+  ch.Write(ss2, Compressor::kNoCompress);
+  ASSERT_LE(ss.tellp(), ss2.tellp());  // Compressed output must not be larger.
+  ch.Clear();
+  ch.Parse(ss);
+  // Unsigned literal: NumBytes() returns size_t, so a plain int literal
+  // would make ASSERT_EQ compare signed with unsigned (as SaveLoad avoids).
+  ASSERT_EQ(ch.NumBytes(), 18U);
+}
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/recordio/header.h"
+namespace paddle {
+namespace recordio {
+// Default header: no records, zero checksum, no compression, zero
+// compressed size.
+Header::Header()
+    : num_records_(0),
+      checksum_(0),
+      compressor_(Compressor::kNoCompress),
+      compress_size_(0) {}
+// Builds a header from explicit values: record count, checksum, compressor
+// and compressed size, in that order.
+Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
+    : num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {}
+// Reads num_records, checksum, compressor and compress_size, in that order,
+// as four raw uint32_t-sized values.  The bytes are copied straight into the
+// members, so they are interpreted in the host's native representation.
+// The stream state is not checked here; inspect `is` afterwards to detect a
+// short or failed read.
+void Header::Parse(std::istream& is) {
+  is.read(reinterpret_cast<char*>(&num_records_), sizeof(uint32_t))
+      .read(reinterpret_cast<char*>(&checksum_), sizeof(uint32_t))
+      .read(reinterpret_cast<char*>(&compressor_), sizeof(uint32_t))
+      .read(reinterpret_cast<char*>(&compress_size_), sizeof(uint32_t));
+}
+// Writes num_records, checksum, compressor and compress_size, in that order,
+// as four raw uint32_t-sized values (the exact layout Parse reads back).
+// The bytes are the in-memory representation of the members; no byte-order
+// conversion is applied and the stream state is not checked.
+void Header::Write(std::ostream& os) const {
+  os.write(reinterpret_cast<const char*>(&num_records_), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&checksum_), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&compressor_), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&compress_size_), sizeof(uint32_t));
+}
+// Streams the four header fields in a human-readable form for logging.
+// The fields are labelled and separated: printed back to back, values such
+// as (1, 23) and (12, 3) would be indistinguishable in the output.
+// The compressor is shown as its underlying integer value.
+std::ostream& operator<<(std::ostream& os, Header h) {
+  os << "Header{num_records=" << h.NumRecords()
+     << ", checksum=" << h.Checksum()
+     << ", compressor=" << static_cast<uint32_t>(h.CompressType())
+     << ", compress_size=" << h.CompressSize() << "}";
+  return os;
+}
+// Two headers are equal when all four fields (record count, checksum,
+// compressor and compressed size) are equal.
+bool operator==(Header l, Header r) {
+  return l.NumRecords() == r.NumRecords() && l.Checksum() == r.Checksum() &&
+         l.CompressType() == r.CompressType() &&
+         l.CompressSize() == r.CompressSize();
+}
+}  // namespace recordio
+}  // namespace paddle
--- a/paddle/fluid/recordio/header.h
+++ b/paddle/fluid/recordio/header.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <sstream>
+namespace paddle {
+namespace recordio {
+// Default ChunkSize
+constexpr size_t kDefaultMaxChunkSize = 32 * 1024 * 1024;
+// MagicNumber for memory checking
+constexpr uint32_t kMagicNumber = 0x01020304;
+// Compression algorithm applied to a chunk.  The underlying type is fixed
+// to uint32_t.
+enum class Compressor : uint32_t {
+  // NoCompression means writing raw chunk data into files.
+  // With other choices, chunks are compressed before being written.
+  kNoCompress = 0,
+  // Snappy has been the default compression algorithm widely
+  // used in Google.  It is a compromise between speed and
+  // compression ratio.
+  kSnappy = 1,
+  // Gzip is a well-known compression algorithm.  It is
+  // recommended only if you are looking for compression ratio.
+  kGzip = 2,
+};
+// Header is the metadata of Chunk: record count, checksum, compressor and
+// compressed size, each stored as a 32-bit value.
+class Header {
+public:
+  // Creates a header with all fields zero and no compression.
+  Header();
+  // num: number of records, sum: checksum, ct: compressor,
+  // cs: compressed size.
+  Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
+  // Writes the four fields to os as raw 4-byte values.
+  void Write(std::ostream& os) const;
+  // Reads the four fields from is, in the order Write emits them.
+  void Parse(std::istream& is);
+  // Plain accessors for the four fields.
+  uint32_t NumRecords() const { return num_records_; }
+  uint32_t Checksum() const { return checksum_; }
+  Compressor CompressType() const { return compressor_; }
+  uint32_t CompressSize() const { return compress_size_; }
+private:
+  uint32_t num_records_;
+  uint32_t checksum_;
+  Compressor compressor_;
+  uint32_t compress_size_;
+};
+// Allow Header Loggable
+std::ostream& operator<<(std::ostream& os, Header h);
+bool operator==(Header l, Header r);
+}  // namespace recordio
+}  // namespace paddle
--- a/paddle/fluid/recordio/header_test.cc
+++ b/paddle/fluid/recordio/header_test.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/recordio/header.h"
+#include <sstream>
+#include "gtest/gtest.h"
+using namespace paddle::recordio;
+// Serializes a header, parses it back from the same stream and expects the
+// parsed header to compare equal to the original.
+TEST(Recordio, ChunkHead) {
+  Header hdr(0, 1, Compressor::kGzip, 3);
+  std::stringstream ss;
+  hdr.Write(ss);
+  // Rewind the read position to the start before parsing.
+  ss.seekg(0, std::ios::beg);
+  Header hdr2;
+  hdr2.Parse(ss);
+  EXPECT_TRUE(hdr == hdr2);
+}
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -31,7 +31,6 @@ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
 BUILD_ROOT=/paddle/build_android
 DEST_ROOT=/paddle/install_android
-rm -rf $BUILD_ROOT 2>/dev/null || true
 mkdir -p $BUILD_ROOT
 cd $BUILD_ROOT

--- a/paddle/scripts/travis/build_android.sh
+++ b/paddle/scripts/travis/build_android.sh
-#!/bin/bash
-set -e
-ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
-TMP_DIR=$HOME/$JOB/tmp
-mkdir -p $TMP_DIR
-cd $TMP_DIR
-wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
-unzip -q android-ndk-r14b-linux-x86_64.zip
-chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
-$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-cd $HOME
-rm -rf $TMP_DIR
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build_android
-cd $TRAVIS_BUILD_DIR/build_android
-# Compile paddle binaries
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      -DWITH_STYLE_CHECK=OFF \
-      ..
-make -j `nproc`
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -12,80 +12,6 @@ make -j `nproc` copy_paddle_pybind
 make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
 # check websites for broken links
-linkchecker doc/en/html/index.html
+linkchecker doc/v2/en/html/index.html
-linkchecker doc/cn/html/index.html
+linkchecker doc/v2/cn/html/index.html
-linkchecker doc/api/en/html/index.html
+linkchecker doc/v2/api/en/html/index.html
-# Parse Github URL
-REPO=`git config remote.origin.url`
-SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:}
-SHA=`git rev-parse --verify HEAD`
-# Documentation branch name
-# gh-pages branch is used for PaddlePaddle.org. The English version of
-# documentation in `doc` directory, and the chinese version in `doc_cn`
-# directory.
-TARGET_BRANCH="gh-pages"
-# Only deploy master branch to build latest documentation.
-SOURCE_BRANCH="master"
-# Clone the repo to output directory
-mkdir output
-git clone $REPO output
-cd output
-function deploy_docs() {
-  SOURCE_BRANCH=$1
-  DIR=$2
-  # If is not a Github pull request
-  if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
-    exit 0
-  fi
-  # If it is not watched branch.
-  if [ "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then
-    return
-  fi
-  # checkout github page branch
-  git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
-  mkdir -p ${DIR}
-  # remove old docs. mv new docs.
-  set +e
-  rm -rf ${DIR}/doc ${DIR}/doc_cn ${DIR}/api_doc
-  set -e
-  cp -r ../doc/cn/html ${DIR}/doc_cn
-  cp -r ../doc/en/html ${DIR}/doc
-  cp -r ../doc/api/en/html ${DIR}/api_doc
-  git add .
-}
-deploy_docs "master" "."
-deploy_docs "develop" "./develop/"
-# Check is there anything changed.
-set +e
-git diff --cached --exit-code >/dev/null
-if [ $? -eq 0 ]; then
-  echo "No changes to the output on this push; exiting."
-  exit 0
-fi
-set -e
-if [ -n $SSL_KEY ]; then  # Only push updated docs for github.com/PaddlePaddle/Paddle.
-  # Commit
-  git add .
-  git config user.name "Travis CI"
-  git config user.email "paddle-dev@baidu.com"
-  git commit -m "Deploy to GitHub Pages: ${SHA}"
-  # Set ssh private key
-  openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
-  chmod 600 deploy_key
-  eval `ssh-agent -s`
-  ssh-add deploy_key
-  # Push
-  git push $SSH_REPO $TARGET_BRANCH
-fi
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -26,9 +26,9 @@ import initializer
 import layers
 import nets
 import optimizer
-import learning_rate_decay
 import backward
 import regularizer
+import average
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace

--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    Class of all kinds of Average.
+    All Averages are implemented purely in Python.
+    They do not change Paddle's Program, nor do anything to
+    modify the NN model's configuration. They are plain
+    wrappers around Python functions.
+"""
+# The docstring must be the first statement of the module to become its
+# __doc__; placed after the import it is only a discarded string expression.
+import numpy as np
+def _is_number_(var):
+    """Return True for an int, a float, or a numpy ndarray of shape (1, )."""
+    return isinstance(var, int) or isinstance(var, float) or (isinstance(
+        var, np.ndarray) and var.shape == (1, ))
+def _is_number_or_matrix_(var):
+    """Return True for anything _is_number_ accepts or for any ndarray."""
+    return _is_number_(var) or isinstance(var, np.ndarray)
+class WeightedAverage(object):
+    """Accumulate a weighted mean: sum(value * weight) / sum(weight).
+    Values may be numbers or numpy ndarrays; weights must be numbers
+    (int, float or an ndarray of shape (1, )).
+    """
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        """Forget everything accumulated so far."""
+        self.numerator = None
+        self.denominator = None
+    def add(self, value, weight):
+        """Accumulate `value` with the given `weight`.
+        Args:
+            value: an int, a float or a numpy ndarray.
+            weight: an int, a float or a numpy ndarray of shape (1, ).
+        Raises:
+            ValueError: if `value` or `weight` has an unsupported type.
+        """
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        # `value * weight` always yields a fresh object, never the caller's.
+        term = value * weight
+        if self.numerator is None or self.denominator is None:
+            self.numerator = term
+            # Copy an ndarray weight: keeping a reference would tie the
+            # accumulator to an array owned by the caller.
+            if isinstance(weight, np.ndarray):
+                self.denominator = weight.copy()
+            else:
+                self.denominator = weight
+        else:
+            # Rebind instead of using `+=`.  An in-place add on an ndarray
+            # would write into a shared array, and it raises when an integer
+            # accumulator receives a float contribution.
+            self.numerator = self.numerator + term
+            self.denominator = self.denominator + weight
+    def eval(self):
+        """Return the accumulated numerator divided by the denominator.
+        NOTE: with plain int operands the `/` below floor-divides on
+        Python 2; pass float values or weights to get a fractional mean.
+        Raises:
+            ValueError: if nothing has been added since the last reset.
+        """
+        if self.numerator is None or self.denominator is None:
+            raise ValueError(
+                "There is no data to be averaged in WeightedAverage.")
+        return self.numerator / self.denominator
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -486,7 +486,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
    params_and_grads = []
    for param in parameters:
        if param not in grad_info_map:
-            raise ValueError("param %s is not in map" % param)
+            continue
        grad_info = grad_info_map[param]
        grad_block = grad_info[1]
        if not grad_block.has_var(grad_info[0]):

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# TODO: Variables: make_channel
-# TODO: Operators: send, close_channel, recv, go, select
 from layers.control_flow import BlockGuard
-from layer_helper import LayerHelper
+from layer_helper import LayerHelper, unique_name
+from layers import fill_constant
 import core
 __all__ = [
    'Go',
    'make_channel',
@@ -46,27 +46,35 @@ class Go(BlockGuard):
        parent_block = main_program.block(main_program.current_block()
                                          .parent_idx)
+        inner_outputs = set()
        x_name_list = set()
-        out_vars = []
        for op in go_block.ops:
            # Iterate over all operators, get all the inputs
            # and add as input to the Go operator.
            for iname in op.input_names:
                for in_var_name in op.input(iname):
-                    x_name_list.add(in_var_name)
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)
-            # Iterate over all operators , get all the outputs
-            # add to the output list of Go operator only if
-            # they exist in the parent block.
            for oname in op.output_names:
                for out_var_name in op.output(oname):
-                    if out_var_name in parent_block.vars:
+                    inner_outputs.add(out_var_name)
-                        out_vars.add(parent_block.var(out_var_name))
+        # Iterate over all operators , get all the outputs
+        # add to the output list of Go operator only if
+        # they exist in the parent block.
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
        parent_block.append_op(
            type='go',
-            inputs={'X': [parent_block.var(x_name) for x_name in x_name_list]},
+            inputs={
-            outputs={'Out': out_vars},
+                'X':
+                [parent_block.var_recursive(x_name) for x_name in x_name_list]
+            },
+            outputs={},
            attrs={'sub_block': go_block})
@@ -88,8 +96,8 @@ def make_channel(dtype, capacity=0):
    `channel_close`, and `Go` to design a concurrent Paddle program.
    Args:
-        dtype (ParamAttr|int): Data type of the data sent in the channel.
+        dtype (ParamAttr|string): Data type of the data sent in the channel.
-        This data type should be one of the Paddle supported data types.
+        This data type should be the string name of a numpy data type.
        capacity (ParamAttr|int): Size of the channel. Defaults to 0 for
        to create an unbuffered channel.
@@ -106,14 +114,16 @@ def make_channel(dtype, capacity=0):
          fluid.channel_send(ch, 100)
          fluid.channel_close(ch)
    """
-    helper = LayerHelper('make_channel', **locals())
+    helper = LayerHelper('channel_create', **locals())
    main_program = helper.main_program
    make_channel_block = main_program.current_block()
    # Make a channel variable (using the channel data type) and make sure it
    # persists into the global scope.
    channel = helper.create_variable(
-        dtype=core.VarDesc.VarType.CHANNEL, persistable=True)
+        name=unique_name.generate('channel'),
+        type=core.VarDesc.VarType.CHANNEL,
+        persistable=True)
    create_channel_op = make_channel_block.append_op(
        type="channel_create",
@@ -121,7 +131,7 @@ def make_channel(dtype, capacity=0):
        attrs={"data_type": dtype,
               "capacity": capacity})
-    return create_channel_op
+    return channel
 def channel_send(channel, value):
@@ -133,7 +143,7 @@ def channel_send(channel, value):
    Args:
        channel (Variable|Channel): Channel variable created using
        `make_channel`.
+        value (Variable): Value to send to channel
    Returns:
        Variable: The boolean status on whether or not the channel
                  successfully sent the passed value.
@@ -149,7 +159,11 @@ def channel_send(channel, value):
    helper = LayerHelper('channel_send', **locals())
    main_program = helper.main_program
    channel_send_block = main_program.current_block()
-    status = helper.create_variable(dtype=core.VarDesc.VarType.TENSOR)
+    status = helper.create_variable(
+        name=unique_name.generate('status'),
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        dtype=core.VarDesc.VarType.BOOL)
    channel_send_op = channel_send_block.append_op(
        type="channel_send",
@@ -159,10 +173,10 @@ def channel_send(channel, value):
        },
        outputs={"Status": status})
-    return channel_send_op
+    return status
-def channel_recv(channel, dtype):
+def channel_recv(channel, return_value):
    """
    Receives a value through a channel variable. Used by an unbuffered or
    buffered channel within a concurrent Go block to get data from originally
@@ -172,11 +186,10 @@ def channel_recv(channel, dtype):
    Args:
        channel (Variable|Channel): Channel variable created using
        `make_channel`.
-        dtype (Variable|int): Data type of the data expected to be read in the
+        return_value (Variable): Variable to set as a result of running channel_recv_op
-        channel. This data type should be one of the Paddle supported data
-        types.
    Returns:
+        Variable: The received value from the channel.
        Variable: The boolean status on whether or not the channel
                  successfully received the passed value.
@@ -185,7 +198,7 @@ def channel_recv(channel, dtype):
          ch = fluid.make_channel(dtype='int32', capacity=10)
          with fluid.Go():
-            fluid.channel_recv(ch, 'int32')
+            returned_value = fluid.channel_recv(ch, 'int32')
          # Code to send data through the channel.
    """
@@ -193,8 +206,10 @@ def channel_recv(channel, dtype):
    main_program = helper.main_program
    channel_recv_block = main_program.current_block()
-    return_value = helper.create_variable(dtype=dtype)
+    status = helper.create_variable(
-    status = helper.create_variable(dtype=core.VarDesc.VarType.TENSOR)
+        name=unique_name.generate('status'),
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        dtype=core.VarDesc.VarType.BOOL)
    channel_recv_op = channel_recv_block.append_op(
        type="channel_recv",
@@ -202,7 +217,7 @@ def channel_recv(channel, dtype):
        outputs={"Out": return_value,
                 "Status": status})
-    return channel_recv_op
+    return return_value, status
 def channel_close(channel):
@@ -228,5 +243,3 @@ def channel_close(channel):
    channel_close_op = channel_close_block.append_op(
        type="channel_close", inputs={"Channel": channel})
-    return channel_close_op
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -279,7 +279,6 @@ class DistributeTranspiler:
                type=v.type,
                dtype=v.dtype,
                shape=v.shape)
-            print("create origin var: ", orig_var_name)
            for trainer_id in xrange(self.trainers):
                var = pserver_program.global_block().create_var(
                    name="%s.trainer_%d" % (orig_var_name, trainer_id),
@@ -288,7 +287,6 @@ class DistributeTranspiler:
                    dtype=v.dtype,
                    shape=v.shape)
                recv_inputs.append(var)
-                print("create per trainer var: ", var.name)
        # step3
        optimize_block = pserver_program.create_block(0)
        # step 4

--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -18,10 +18,13 @@ import layers
 from framework import Program, Variable, program_guard
 import unique_name
 from layer_helper import LayerHelper
+from initializer import Constant
 __all__ = [
    'Accuracy',
    'ChunkEvaluator',
+    'EditDistance',
+    'DetectionMAP',
 ]
@@ -105,44 +108,6 @@ class Evaluator(object):
        return state
-class Accuracy(Evaluator):
-    """
-    Average Accuracy for multiple mini-batches.
-    """
-    def __init__(self, input, label, k=1, **kwargs):
-        super(Accuracy, self).__init__("accuracy", **kwargs)
-        main_program = self.helper.main_program
-        if main_program.current_block().idx != 0:
-            raise ValueError("You can only invoke Evaluator in root block")
-        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
-        self.correct = self.create_state(
-            dtype='int64', shape=[1], suffix='correct')
-        total = self.helper.create_tmp_variable(dtype='int')
-        correct = self.helper.create_tmp_variable(dtype='int')
-        acc = layers.accuracy(
-            input=input, label=label, k=k, total=total, correct=correct)
-        total = layers.cast(x=total, dtype='int64')
-        correct = layers.cast(x=correct, dtype='int64')
-        layers.sums(input=[self.total, total], out=self.total)
-        layers.sums(input=[self.correct, correct], out=self.correct)
-        self.metrics.append(acc)
-    def eval(self, executor, eval_program=None):
-        if eval_program is None:
-            eval_program = Program()
-        block = eval_program.current_block()
-        with program_guard(main_program=eval_program):
-            total = _clone_var_(block, self.total)
-            correct = _clone_var_(block, self.correct)
-            total = layers.cast(total, dtype='float32')
-            correct = layers.cast(correct, dtype='float32')
-            out = layers.elementwise_div(x=correct, y=total)
-        return np.array(executor.run(eval_program, fetch_list=[out])[0])
 class ChunkEvaluator(Evaluator):
    """
    Accumulate counter numbers output by chunk_eval from mini-batches and
@@ -211,7 +176,7 @@ class ChunkEvaluator(Evaluator):
 class EditDistance(Evaluator):
    """
    Accumulate edit distance sum and sequence number from mini-batches and
-    compute the average edit_distance of all batches.
+    compute the average edit_distance and instance error of all batches.
    Args:
        input: the sequences predicted by network.
@@ -227,14 +192,12 @@ class EditDistance(Evaluator):
        for epoch in PASS_NUM:
            distance_evaluator.reset(exe)
            for data in batches:
-                loss, sum_distance = exe.run(fetch_list=[cost] + distance_evaluator.metrics)
+                loss = exe.run(fetch_list=[cost])
-                avg_distance = distance_evaluator.eval(exe)
+            distance, instance_error = distance_evaluator.eval(exe)
-            pass_distance = distance_evaluator.eval(exe)
        In the above example:
-        'sum_distance' is the sum of the batch's edit distance.
+        'distance' is the average of the edit distance in a pass.
-        'avg_distance' is the average of edit distance from the firt batch to the current batch.
+        'instance_error' is the instance error rate in a pass.
-        'pass_distance' is the average of edit distance from all the pass.
    """
@@ -244,25 +207,172 @@ class EditDistance(Evaluator):
        if main_program.current_block().idx != 0:
            raise ValueError("You can only invoke Evaluator in root block")
-        self.total_error = self.create_state(
+        self.total_distance = self.create_state(
-            dtype='float32', shape=[1], suffix='total_error')
+            dtype='float32', shape=[1], suffix='total_distance')
        self.seq_num = self.create_state(
            dtype='int64', shape=[1], suffix='seq_num')
-        error, seq_num = layers.edit_distance(
+        self.instance_error = self.create_state(
+            dtype='int64', shape=[1], suffix='instance_error')
+        distances, seq_num = layers.edit_distance(
            input=input, label=label, ignored_tokens=ignored_tokens)
-        #error = layers.cast(x=error, dtype='float32')
-        sum_error = layers.reduce_sum(error)
+        zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32')
-        layers.sums(input=[self.total_error, sum_error], out=self.total_error)
+        compare_result = layers.equal(distances, zero)
+        compare_result_int = layers.cast(x=compare_result, dtype='int')
+        seq_right_count = layers.reduce_sum(compare_result_int)
+        instance_error_count = layers.elementwise_sub(
+            x=seq_num, y=seq_right_count)
+        total_distance = layers.reduce_sum(distances)
+        layers.sums(
+            input=[self.total_distance, total_distance],
+            out=self.total_distance)
        layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
-        self.metrics.append(sum_error)
+        layers.sums(
+            input=[self.instance_error, instance_error_count],
+            out=self.instance_error)
+        self.metrics.append(total_distance)
+        self.metrics.append(instance_error_count)
    def eval(self, executor, eval_program=None):
        if eval_program is None:
            eval_program = Program()
        block = eval_program.current_block()
        with program_guard(main_program=eval_program):
-            total_error = _clone_var_(block, self.total_error)
+            total_distance = _clone_var_(block, self.total_distance)
            seq_num = _clone_var_(block, self.seq_num)
+            instance_error = _clone_var_(block, self.instance_error)
            seq_num = layers.cast(x=seq_num, dtype='float32')
-            out = layers.elementwise_div(x=total_error, y=seq_num)
+            instance_error = layers.cast(x=instance_error, dtype='float32')
-        return np.array(executor.run(eval_program, fetch_list=[out])[0])
+            avg_distance = layers.elementwise_div(x=total_distance, y=seq_num)
+            avg_instance_error = layers.elementwise_div(
+                x=instance_error, y=seq_num)
+            result = executor.run(
+                eval_program, fetch_list=[avg_distance, avg_instance_error])
+        return np.array(result[0]), np.array(result[1])
+class DetectionMAP(Evaluator):
+    """
+    Calculate the detection mean average precision (mAP).
+    TODO (Dang Qingqing): update the following doc.
+    The general steps are as follows:
+    1. calculate the true positive and false positive according to the input
+        of detection and labels.
+    2. calculate mAP value, support two versions: '11 point' and 'integral'.
+    Please get more information from the following articles:
+      https://sanchom.wordpress.com/tag/average-precision/
+      https://arxiv.org/abs/1512.02325
+    Args:
+        input (Variable): The detection results, which is a LoDTensor with shape
+            [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
+        gt_label (Variable): The ground truth label index, which is a LoDTensor
+            with shape [N, 1].
+        gt_box (Variable): The ground truth bounding box (bbox), which is a
+            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable): Whether this ground truth is a difficult
+            bounding box (bbox), which is a LoDTensor [N, 1].
+        class_num (int): The class number.
+        background_label (int): The index of background label, the background
+            label will be ignored. If set to -1, then all categories will be
+            considered, 0 by default.
+        overlap_threshold (float): The threshold for deciding true/false
+            positive, 0.5 by default.
+        evaluate_difficult (bool): Whether to consider difficult ground truth
+            for evaluation, True by default.
+        ap_version (string): The average precision calculation ways, it must be
+            'integral' or '11point', 'integral' by default. Please check
+            https://sanchom.wordpress.com/tag/average-precision/ for details.
+            - 11point: the 11-point interpolated average precision.
+            - integral: the natural integral of the precision-recall curve.
+    Example:
+        exe = fluid.executor(place)
+        map_evaluator = fluid.Evaluator.DetectionMAP(input,
+            gt_label, gt_box, gt_difficult, class_num)
+        cur_map, accum_map = map_evaluator.get_map_var()
+        fetch = [cost, cur_map, accum_map]
+        for epoch in PASS_NUM:
+            map_evaluator.reset(exe)
+            for data in batches:
+                loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
+        In the above example:
+        'cur_map_v' is the mAP of current mini-batch.
+        'accum_map_v' is the accumulative mAP of one pass.
+    """
+    def __init__(self,
+                 input,
+                 gt_label,
+                 gt_box,
+                 gt_difficult,
+                 class_num,
+                 background_label=0,
+                 overlap_threshold=0.5,
+                 evaluate_difficult=True,
+                 ap_version='integral'):
+        super(DetectionMAP, self).__init__("map_eval")
+        # Cast label and difficult flag to the box dtype so the three can be
+        # concatenated column-wise into one label tensor.
+        gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
+        gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+        label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        # calculate mean average precision (mAP) of current mini-batch
+        map = layers.detection_map(
+            input,
+            label,
+            class_num,
+            background_label,
+            overlap_threshold=overlap_threshold,
+            evaluate_difficult=evaluate_difficult,
+            ap_version=ap_version)
+        # NOTE(review): presumably create_state registers each state in
+        # self.states, which is passed to detection_map below -- confirm
+        # against Evaluator.create_state.
+        self.create_state(dtype='int32', shape=None, suffix='accum_pos_count')
+        self.create_state(dtype='float32', shape=None, suffix='accum_true_pos')
+        self.create_state(dtype='float32', shape=None, suffix='accum_false_pos')
+        # has_state is a persistable int32 flag initialised to 0; it is set
+        # to 1 after the accumulating op below and written back to 0 by
+        # reset().
+        self.has_state = None
+        var = self.helper.create_variable(
+            persistable=True, dtype='int32', shape=[1])
+        self.helper.set_variable_initializer(
+            var, initializer=Constant(value=int(0)))
+        self.has_state = var
+        # calculate accumulative mAP
+        accum_map = layers.detection_map(
+            input,
+            label,
+            class_num,
+            background_label,
+            overlap_threshold=overlap_threshold,
+            evaluate_difficult=evaluate_difficult,
+            has_state=self.has_state,
+            input_states=self.states,
+            out_states=self.states,
+            ap_version=ap_version)
+        # Mark the flag as set once the accumulating op has been appended.
+        layers.fill_constant(
+            shape=self.has_state.shape,
+            value=1,
+            dtype=self.has_state.dtype,
+            out=self.has_state)
+        self.cur_map = map
+        self.accum_map = accum_map
+    def get_map_var(self):
+        """Return the (current mini-batch mAP, accumulative mAP) variables."""
+        return self.cur_map, self.accum_map
+    def reset(self, executor, reset_program=None):
+        """Write 0 into the has_state flag by running a small program.
+        Args:
+            executor: the executor used to run the reset program.
+            reset_program: program to build the reset op in; a new Program
+                is created when it is None.
+        """
+        if reset_program is None:
+            reset_program = Program()
+        with program_guard(main_program=reset_program):
+            var = _clone_var_(reset_program.current_block(), self.has_state)
+            layers.fill_constant(
+                shape=var.shape, value=0, dtype=var.dtype, out=var)
+        executor.run(reset_program)
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -163,6 +163,22 @@ def fetch_var(name, scope=None, return_numpy=True):
    return tensor
+def get_program_cache_key(feed, fetch_list):
+    """Build the string key used to look up a cached program.
+
+    Args:
+        feed(dict): feed map; only its keys (the feed variable names) are used.
+        fetch_list(list): Variables or variable names to fetch.
+
+    Returns:
+        str: a key derived from the feed names and the fetch names.
+
+    Raises:
+        TypeError: if an item of fetch_list is neither a Variable nor a str.
+    """
+
+    def to_name_str(var):
+        if isinstance(var, str):
+            return var
+        elif isinstance(var, Variable):
+            return var.desc.name()
+        else:
+            raise TypeError(str(var) + " should be Variable or str")
+
+    # Build real lists instead of relying on keys()/map() returning lists,
+    # so this also works where they return views/iterators.
+    feed_var_names = list(feed.keys())
+    fetch_var_names = [to_name_str(var) for var in fetch_list]
+    # Keep the two name lists separate in the key: concatenating them first
+    # would give feed={a, b}/fetch=[c] and feed={a}/fetch=[b, c] the same key.
+    return str(feed_var_names) + str(fetch_var_names)
 class Executor(object):
    def __init__(self, places):
        if not isinstance(places, list) and not isinstance(places, tuple):
@@ -177,6 +193,7 @@ class Executor(object):
        # TODO(dzhwinter) : only use the first place
        self.executor = core.Executor(act_places[0])
        self.places = places
+        self.program_caches = dict()
    def aslodtensor(self, data):
        def accumulate(data):
@@ -225,9 +242,30 @@ class Executor(object):
            feed_var_name='feed',
            fetch_var_name='fetch',
            scope=None,
-            return_numpy=True):
+            return_numpy=True,
+            use_program_cache=False):
+        """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
+        Python executor takes a program, add feed operators and fetch operators to this program according
+        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
+        the variables(or names) that user want to get after program run. Note: the executor will run all
+        operators in the program but not only the operators dependent by the fetch_list
+        :param program: the program that needs to run; if not provided, default_main_program will be used.
+        :param feed: feed variable map, e.g. {"image": ImageData, "label": LabelData}
+        :param fetch_list: a list of variable or variable names that user want to get, run will return them according
+        to this list.
+        :param feed_var_name: the name for the input variable of feed Operator.
+        :param fetch_var_name: the name for the output variable of fetch Operator.
+        :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope
+        :param return_numpy: if convert the fetched tensor to numpy
+        :param use_program_cache: set use_program_cache to True if the program has not changed compared to the last step.
+        :return: result according to fetch_list.
+        """
        if feed is None:
            feed = {}
+        if not isinstance(feed, dict):
+            raise TypeError("feed should be a map")
        if fetch_list is None:
            fetch_list = []
@@ -240,35 +278,64 @@ class Executor(object):
        if scope is None:
            scope = global_scope()
-        program = program.clone()
+        program_cache = None
-        global_block = program.global_block()
+        program_cache_key = get_program_cache_key(feed, fetch_list)
-        if feed_var_name in global_block.vars:
+        if use_program_cache:
-            feed_var = global_block.var(feed_var_name)
+            # find program cache by cache_key
+            program_cache = self.program_caches.get(program_cache_key, None)
+            # TODO(qiao): Should check program_cache and program are exactly the same.
        else:
-            feed_var = global_block.create_var(
+            self.program_caches.pop(program_cache_key, None)
-                name=feed_var_name,
-                type=core.VarDesc.VarType.FEED_MINIBATCH,
-                persistable=True)
-        if fetch_var_name in global_block.vars:
+        if program_cache is None:
-            fetch_var = global_block.var(fetch_var_name)
+            program_cache = program.clone()
-        else:
-            fetch_var = global_block.create_var(
+            if use_program_cache:
-                name=fetch_var_name,
+                self.program_caches[program_cache_key] = program_cache
-                type=core.VarDesc.VarType.FETCH_LIST,
-                persistable=True)
+            global_block = program_cache.global_block()
-        if not has_feed_operators(global_block, feed, feed_var_name):
+            if feed_var_name in global_block.vars:
-            for i, name in enumerate(feed):
+                feed_var = global_block.var(feed_var_name)
-                out = global_block.var(name)
+            else:
-                global_block.prepend_op(
+                feed_var = global_block.create_var(
-                    type='feed',
+                    name=feed_var_name,
-                    inputs={'X': [feed_var]},
+                    type=core.VarDesc.VarType.FEED_MINIBATCH,
-                    outputs={'Out': [out]},
+                    persistable=True)
-                    attrs={'col': i})
+            if fetch_var_name in global_block.vars:
-        for op in global_block.ops:
+                fetch_var = global_block.var(fetch_var_name)
+            else:
+                fetch_var = global_block.create_var(
+                    name=fetch_var_name,
+                    type=core.VarDesc.VarType.FETCH_LIST,
+                    persistable=True)
+            # prepend feed operators
+            if not has_feed_operators(global_block, feed, feed_var_name):
+                for i, name in enumerate(feed):
+                    out = global_block.var(name)
+                    global_block.prepend_op(
+                        type='feed',
+                        inputs={'X': [feed_var]},
+                        outputs={'Out': [out]},
+                        attrs={'col': i})
+            # append fetch_operators
+            if not has_fetch_operators(global_block, fetch_list,
+                                       fetch_var_name):
+                for i, var in enumerate(fetch_list):
+                    assert isinstance(var, Variable) or isinstance(var, str), (
+                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
+                    global_block.append_op(
+                        type='fetch',
+                        inputs={'X': [var]},
+                        outputs={'Out': [fetch_var]},
+                        attrs={'col': i})
+        # feed var to framework
+        for op in program_cache.global_block().ops:
            if op.desc.type() == 'feed':
                feed_target_name = op.desc.output('Out')[0]
                cur_feed = feed[feed_target_name]
@@ -279,17 +346,7 @@ class Executor(object):
            else:
                break
-        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
+        self.executor.run(program_cache.desc, scope, 0, True, True)
-            for i, var in enumerate(fetch_list):
-                assert isinstance(var, Variable) or isinstance(var, str), (
-                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
-                global_block.append_op(
-                    type='fetch',
-                    inputs={'X': [var]},
-                    outputs={'Out': [fetch_var]},
-                    attrs={'col': i})
-        self.executor.run(program.desc, scope, 0, True, True)
        outs = [
            core.get_fetch_variable(scope, fetch_var_name, i)
            for i in xrange(len(fetch_list))

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -773,7 +773,7 @@ class Block(object):
            stop_gradient = v.stop_gradient
        else:
            raise ValueError("unsupported var type: %s", type(v))
+        orig_var_type = v.type
        self.desc.rename_var(name, new_name)
        # NOTE: v is destroyed by C++ after calling rename_var.
        d = self.desc.find_var(new_name)
@@ -782,6 +782,7 @@ class Block(object):
                self,
                d.shape(),
                d.dtype(),
+                type=orig_var_type,
                name=new_name,
                stop_gradient=stop_gradient,
                trainable=trainable,
@@ -792,7 +793,7 @@ class Block(object):
        elif var_type == "Variable":
            var = Variable(
                self,
-                type=v.type,
+                type=orig_var_type,
                name=new_name,
                error_clip=error_clip,
                stop_gradient=stop_gradient)
@@ -955,9 +956,26 @@ class Program(object):
    def get_desc(self):
        return self.desc
-    def clone(self):
+    def clone(self, for_test=False):
+        """Clone the Program object
+        Set for_test to False when we want to clone the program for training.
+        Set for_test to True when we want to clone the program for testing.         
+        Args:
+            for_test(bool): Some operators, such as batch_norm and drop_out ops,
+                behave differently in training and testing. If for_test is True,
+                the is_test attributes in these operators will be set to True for
+                testing purposes, otherwise, they remain unchanged.  
+        Returns(Program):
+            The cloned Program object.
+        """
        p = Program()
-        p.desc = core.ProgramDesc(self.desc)
+        if for_test:
+            p.desc = core.inference_optimize(self.desc)
+        else:
+            p.desc = core.ProgramDesc(self.desc)
        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
        p.sync_with_cpp()
        p.copy_param_info_from(self)

--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -28,6 +28,9 @@ import math_op_patch
 from math_op_patch import *
 import detection
 from detection import *
+import metric
+from metric import *
+from learning_rate_scheduler import *
 __all__ = []
 __all__ += math_op_patch.__all__
@@ -38,3 +41,5 @@ __all__ += control_flow.__all__
 __all__ += ops.__all__
 __all__ += device.__all__
 __all__ += detection.__all__
+__all__ += metric.__all__
+__all__ += learning_rate_scheduler.__all__
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -54,11 +54,17 @@ def detection_output(loc,
                     score_threshold=0.01,
                     nms_eta=1.0):
    """
-    **Detection Output Layer**
+    **Detection Output Layer for Single Shot Multibox Detector (SSD).**
-    This layer applies the NMS to the output of network and computes the
+    This operation is to get the detection results by performing following
-    predict bounding box location. The output's shape of this layer could
+    two steps:
-    be zero if there is no valid bounding box.
+    1. Decode input bounding box predictions according to the prior boxes.
+    2. Get the final detection results by applying multi-class non maximum
+       suppression (NMS).
+    Please note, this operation doesn't clip the final output bounding boxes
+    to the image window.
    Args:
        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
@@ -91,7 +97,15 @@ def detection_output(loc,
        nms_eta(float): The parameter for adaptive NMS.
    Returns:
-        The detected bounding boxes which are a Tensor.
+        Variable: The detection outputs is a LoDTensor with shape [No, 6].
+            Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
+            `No` is the total number of detections in this mini-batch. For each
+            instance, the offsets in first dimension are called LoD, the offset
+            number is N + 1, N is the batch size. The i-th image has
+            `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
+            has no detected results. If all images have not detected results,
+            all the elements in LoD are 0, and output tensor only contains one
+            value, which is -1.
    Examples:
        .. code-block:: python
@@ -137,23 +151,36 @@ def detection_output(loc,
 @autodoc()
 def detection_map(detect_res,
                  label,
-                  pos_count=None,
+                  class_num,
-                  true_pos=None,
+                  background_label=0,
-                  false_pos=None,
                  overlap_threshold=0.3,
                  evaluate_difficult=True,
-                  ap_type='integral'):
+                  has_state=None,
+                  input_states=None,
+                  out_states=None,
+                  ap_version='integral'):
    helper = LayerHelper("detection_map", **locals())
-    map_out = helper.create_tmp_variable(dtype='float32')
+    def __create_var(type):
-    accum_pos_count_out = helper.create_tmp_variable(dtype='int32')
+        return helper.create_tmp_variable(dtype=type)
-    accum_true_pos_out = helper.create_tmp_variable(dtype='float32')
-    accum_false_pos_out = helper.create_tmp_variable(dtype='float32')
+    map_out = __create_var('float32')
+    accum_pos_count_out = out_states[0] if out_states else __create_var('int32')
+    accum_true_pos_out = out_states[1] if out_states else __create_var(
+        'float32')
+    accum_false_pos_out = out_states[2] if out_states else __create_var(
+        'float32')
+    pos_count = input_states[0] if input_states else None
+    true_pos = input_states[1] if input_states else None
+    false_pos = input_states[2] if input_states else None
    helper.append_op(
        type="detection_map",
        inputs={
            'Label': label,
            'DetectRes': detect_res,
+            'HasState': has_state,
            'PosCount': pos_count,
            'TruePos': true_pos,
            'FalsePos': false_pos
@@ -167,9 +194,10 @@ def detection_map(detect_res,
        attrs={
            'overlap_threshold': overlap_threshold,
            'evaluate_difficult': evaluate_difficult,
-            'ap_type': ap_type
+            'ap_type': ap_version,
+            'class_num': class_num,
        })
-    return map_out, accum_pos_count_out, accum_true_pos_out, accum_false_pos_out
+    return map_out
 def bipartite_match(dist_matrix,

--- a/python/paddle/fluid/learning_rate_decay.py
+++ b/python/paddle/fluid/learning_rate_decay.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import layers
+import control_flow
-from initializer import init_on_cpu
+import nn
+import ops
+import tensor
+from ..initializer import init_on_cpu
 __all__ = [
    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -31,9 +34,9 @@ strategy according to this module.
 def _decay_step_counter():
    # the first global step is zero in learning rate decay
-    global_step = layers.autoincreased_step_counter(
+    global_step = nn.autoincreased_step_counter(
        counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
-    global_step = layers.cast(global_step, 'float32')
+    global_step = tensor.cast(global_step, 'float32')
    return global_step
@@ -60,7 +63,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
        # update learning_rate
        div_res = global_step / decay_steps
        if staircase:
-            div_res = layers.floor(x=div_res)
+            div_res = ops.floor(div_res)
        decayed_lr = learning_rate * (decay_rate**div_res)
    return decayed_lr
@@ -89,8 +92,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    with init_on_cpu():
        div_res = global_step / decay_steps
        if staircase:
-            div_res = layers.floor(x=div_res)
+            div_res = ops.floor(div_res)
-        decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+        decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
    return decayed_lr
@@ -118,7 +121,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    with init_on_cpu():
        div_res = global_step / decay_steps
        if staircase:
-            div_res = layers.floor(x=div_res)
+            div_res = ops.floor(div_res)
        decayed_lr = learning_rate / (1 + decay_rate * div_res)
@@ -154,21 +157,20 @@ def polynomial_decay(learning_rate,
    with init_on_cpu():
        if cycle:
-            div_res = layers.ceil(x=(global_step / decay_steps))
+            div_res = ops.ceil(global_step / decay_steps)
-            zero_var = layers.fill_constant(
+            zero_var = tensor.fill_constant(
                shape=[1], dtype='float32', value=0.0)
-            one_var = layers.fill_constant(
+            one_var = tensor.fill_constant(
                shape=[1], dtype='float32', value=1.0)
-            with layers.Switch() as switch:
+            with control_flow.Switch() as switch:
                with switch.case(global_step == zero_var):
-                    layers.assign(input=one_var, output=div_res)
+                    tensor.assign(input=one_var, output=div_res)
            decay_steps = decay_steps * div_res
        else:
-            decay_steps_var = layers.fill_constant(
+            decay_steps_var = tensor.fill_constant(
                shape=[1], dtype='float32', value=float(decay_steps))
-            global_step = layers.elementwise_min(
+            global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
-                x=global_step, y=decay_steps_var)
        decayed_lr = (learning_rate - end_learning_rate) * \
                     ((1 - global_step / decay_steps) ** power) + end_learning_rate
@@ -195,26 +197,26 @@ def piecewise_decay(boundaries, values):
    global_step = _decay_step_counter()
    with init_on_cpu():
-        lr = layers.create_global_var(
+        lr = tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="learning_rate")
-        with layers.Switch() as switch:
+        with control_flow.Switch() as switch:
            for i in range(len(boundaries)):
-                boundary_val = layers.fill_constant(
+                boundary_val = tensor.fill_constant(
                    shape=[1], dtype='float32', value=float(boundaries[i]))
-                value_var = layers.fill_constant(
+                value_var = tensor.fill_constant(
                    shape=[1], dtype='float32', value=float(values[i]))
                with switch.case(global_step < boundary_val):
-                    layers.assign(value_var, lr)
+                    tensor.assign(value_var, lr)
-            last_value_var = layers.fill_constant(
+            last_value_var = tensor.fill_constant(
                shape=[1],
                dtype='float32',
                value=float(values[len(values) - 1]))
            with switch.default():
-                layers.assign(last_value_var, lr)
+                tensor.assign(last_value_var, lr)
    return lr
--- a/python/paddle/fluid/tests/notest_csp.py
+++ b/python/paddle/fluid/tests/notest_csp.py
@@ -11,27 +11,47 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+All layers just related to metric.
+"""
-import unittest
+from ..layer_helper import LayerHelper
-import paddle.fluid as fluid
+from ..initializer import Normal, Constant
+from ..framework import Variable
+from ..param_attr import ParamAttr
+__all__ = ['accuracy']
-class TestCSPFramework(unittest.TestCase):
-    def daisy_chain(self):
-        n = 10000
-        leftmost = fluid.make_channel(dtype=int)
-        right = leftmost
-        left = leftmost
-        with fluid.While(steps=n):
-            right = fluid.make_channel(dtype=int)
-            with fluid.go():
-                fluid.send(left, 1 + fluid.recv(right))
-            left = right
-        with fluid.go():
+def accuracy(input, label, k=1, correct=None, total=None):
-            fluid.send(right, 1)
+    """
-        fluid.Print(fluid.recv(leftmost))
+    This function computes the accuracy using the input and label.
+    The output is the top_k inputs and their indices.
+    """
-if __name__ == '__main__':
+    helper = LayerHelper("accuracy", **locals())
-    unittest.main()
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
+    return acc_out
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -35,7 +35,6 @@ __all__ = [
    'cos_sim',
    'cross_entropy',
    'square_error_cost',
-    'accuracy',
    'chunk_eval',
    'sequence_conv',
    'conv2d',
@@ -1022,40 +1021,6 @@ def square_error_cost(input, label):
    return square_out
-def accuracy(input, label, k=1, correct=None, total=None):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("accuracy", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
-    acc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
-    return acc_out
 def chunk_eval(input,
               label,
               chunk_scheme,
@@ -1146,6 +1111,7 @@ def conv2d(input,
           param_attr=None,
           bias_attr=None,
           use_cudnn=True,
+           use_mkldnn=False,
           act=None):
    """
    **Convlution2D Layer**
@@ -1287,7 +1253,8 @@ def conv2d(input,
            'strides': stride,
            'paddings': padding,
            'groups': groups,
-            'use_cudnn': use_cudnn
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': use_mkldnn
        })
    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -1438,6 +1405,7 @@ def pool2d(input,
           pool_padding=0,
           global_pooling=False,
           use_cudnn=True,
+           ceil_mode=False,
           name=None):
    """
    This function adds the operator for pooling in 2 dimensions, using the
@@ -1474,7 +1442,8 @@ def pool2d(input,
            "global_pooling": global_pooling,
            "strides": pool_stride,
            "paddings": pool_padding,
-            "use_cudnn": use_cudnn
+            "use_cudnn": use_cudnn,
+            "ceil_mode": ceil_mode
        })
    return pool_out
@@ -2479,10 +2448,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
    return out
-def edit_distance(input,
+def edit_distance(input, label, normalized=True, ignored_tokens=None,
-                  label,
-                  normalized=False,
-                  ignored_tokens=None,
                  name=None):
    """
    EditDistance operator computes the edit distances between a batch of
@@ -3183,7 +3149,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
            label = fluid.layers.data(name='label', shape=[100], dtype='int64')
            fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.smooth_l1(logits=fc, label=label)
+            out = fluid.layers.smooth_l1(x=fc, y=label)
    """
    helper = LayerHelper('smooth_l1_loss', **locals())
    diff = helper.create_tmp_variable(dtype=x.dtype)
@@ -3209,7 +3175,7 @@ def one_hot(input, depth):
    operator.
    Args:
-        input(Tensor/LodTensor):  A Tensor/LodTensor of indices, last dimension must be 1.
+        input(variable):  A Tensor/LodTensor of indices, last dimension must be 1.
        depth(scalar): an interger defining the depth of the one hot dimension.
    Returns:

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -362,3 +362,75 @@ def zeros(shape, dtype, force_cpu=False):
          data = fluid.layers.zeros(shape=[1], dtype='int64')
    """
    return fill_constant(value=0.0, **locals())
+def save(x, file_path, overwrite=True):
+    """
+    Saves a variable as a file.
+
+    Args:
+        x(variable): The Tensor/LoDTensor to be saved.
+        file_path(str): The file path where the variable will be saved.
+        overwrite(bool): Whether or not to overwrite the given file when it
+            already exists. If it's set to 'False' and the file exists, a
+            runtime error will be thrown.
+    """
+    helper = LayerHelper("save", **locals())
+    # Operator attributes go in the `attrs` keyword (not `args`), matching
+    # the other append_op calls in this package.
+    helper.append_op(
+        type="save",
+        inputs={"input": x},
+        outputs={},
+        attrs={"file_path": file_path,
+               "overwrite": overwrite})
+def save_combine(x, file_path, overwrite=True):
+    """
+    Saves a list of variables into a single file.
+
+    Args:
+        x(list): A list of Tensor/LoDTensor to be saved together in a single file.
+        file_path(str): The file path where variables will be saved.
+        overwrite(bool): Whether or not to overwrite the given file when it
+            already exists. If it's set to 'False' and the file exists, a
+            runtime error will be thrown.
+    """
+    helper = LayerHelper("save_combine", **locals())
+    # Operator attributes go in the `attrs` keyword (not `args`), matching
+    # the other append_op calls in this package.
+    helper.append_op(
+        type="save_combine",
+        inputs={"input": x},
+        outputs={},
+        attrs={"file_path": file_path,
+               "overwrite": overwrite})
+def load(out, file_path):
+    """
+    Loads a variable from a given file.
+
+    Args:
+        out(variable): The variable to be read from the disk file.
+        file_path(str): The path of the disk file.
+    """
+    helper = LayerHelper("load", **locals())
+    # `outputs` and `attrs` are the keyword names used by the other
+    # append_op calls; the earlier `output`/`args` spellings did not match.
+    helper.append_op(
+        type="load",
+        inputs={},
+        outputs={"Out": out},
+        attrs={"file_path": file_path})
+def load_combine(out, file_path):
+    """
+    Loads a list of variables from a single file.
+
+    Args:
+        out(list): The list of variables to be read from the disk file.
+        file_path(str): The path of the disk file.
+    """
+    helper = LayerHelper("load_combine", **locals())
+    # `outputs` and `attrs` are the keyword names used by the other
+    # append_op calls; the earlier `output`/`args` spellings did not match.
+    helper.append_op(
+        type="load_combine",
+        inputs={},
+        outputs={"Out": out},
+        attrs={"file_path": file_path})
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
@@ -29,6 +29,8 @@ dtype_to_size = {
    core.VarDesc.VarType.BOOL: 1
 }
+sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
 class ControlFlowGraph(object):
    def __init__(self, Program, ops, forward_num, skip_opt):
@@ -141,7 +143,7 @@ class ControlFlowGraph(object):
        self.pool = []
        for i in range(self.op_size):
            op = self._ops[i]
-            if op.type() == "while" or op.type() == "while_grad":
+            if op.type() in sub_block_ops:
                continue
            block_desc = op.block()
            is_forward = i < self._forward_num
@@ -198,67 +200,75 @@ class ControlFlowGraph(object):
                        block_desc, var_name, is_forward).shape()))
-def get_cfgs(input_program):
+def _process_sub_block_pair(pdesc, sub_block_pair):
    ops_list = []
-    pdesc = input_program.get_desc()
    block_desc = pdesc.block(0)
    op_size = block_desc.op_size()
-    # Get global block ops
+    for fwd_op, bwd_op in sub_block_pair:
-    ops_list.append(
+        sub_block_ids = []
-        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
+        grad_sub_block_ids = []
+        sub_block_id_pair = []
-    while_sub_block_ids = []
+        sub_op_dict = {}
-    while_grad_sub_block_ids = []
+        for i in range(op_size):
-    while_block_id_pair = []
+            op = block_desc.op(i)
-    while_op_dict = {}
+            if op.type() == fwd_op:
+                sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+            elif op.type() == bwd_op:
+                grad_sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
-    for i in range(op_size):
+        # Find fwd_op/bwd_op block pair
-        op = block_desc.op(i)
+        for grad_id in grad_sub_block_ids:
-        if op.type() == "while":
+            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
-            while_sub_block_ids.append(op.attr("sub_block").id)
+            if fwd_id in sub_block_ids:
-            while_op_dict[op.attr("sub_block").id] = op
+                sub_block_id_pair.append((fwd_id, grad_id))
-        elif op.type() == "while_grad":
+                sub_block_ids.remove(fwd_id)
-            while_grad_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_dict[op.attr("sub_block").id] = op
-    # Find while/while_grad block pair
+        # Get fwd_op/bwd_op block ops
-    for grad_id in while_grad_sub_block_ids:
+        for fwd_id, grad_id in sub_block_id_pair:
-        forward_id = pdesc.block(grad_id).get_forward_block_idx()
+            sub_block_ops = []
-        if forward_id in while_sub_block_ids:
+            sub_block = pdesc.block(fwd_id)
-            while_block_id_pair.append((forward_id, grad_id))
+            block_op_size = sub_block.op_size()
-            while_sub_block_ids.remove(forward_id)
+            for i in range(block_op_size):
+                sub_block_ops.append(sub_block.op(i))
-    # Get while/while_grad block ops
+            grad_sub_block = pdesc.block(grad_id)
-    for forward_id, grad_id in while_block_id_pair:
+            grad_sub_block_op_size = grad_sub_block.op_size()
-        while_block_ops = []
+            for i in range(grad_sub_block_op_size):
-        while_block = pdesc.block(forward_id)
+                sub_block_ops.append(grad_sub_block.op(i))
-        while_block_op_size = while_block.op_size()
-        for i in range(while_block_op_size):
-            while_block_ops.append(while_block.op(i))
-        while_grad_block = pdesc.block(grad_id)
+            sub_op_output = set()
-        while_grad_block_op_size = while_grad_block.op_size()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
-        for i in range(while_grad_block_op_size):
+            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
-            while_block_ops.append(while_grad_block.op(i))
+            ops_list.append((sub_block_ops, block_op_size, sub_op_output))
-        while_op_output = set()
+        # Process rest fwd_op block ops
-        while_op_output.update(while_op_dict[forward_id].output_arg_names())
+        for fwd_id in sub_block_ids:
-        while_op_output.update(while_op_dict[grad_id].output_arg_names())
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            sub_block_op_size = sub_block.op_size()
+            for i in range(sub_block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
+    return ops_list
-        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
-    # Process rest while block ops
+def _get_cfgs(input_program):
-    for forward_id in while_sub_block_ids:
+    ops_list = []
-        while_block_ops = []
+    pdesc = input_program.get_desc()
-        while_block = pdesc.block(forward_id)
+    block_desc = pdesc.block(0)
-        while_block_op_size = while_block.op_size()
+    op_size = block_desc.op_size()
-        for i in range(while_block_op_size):
+    # Get global block ops
-            while_block_ops.append(while_block.op(i))
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
-        while_op_output = set()
+    sub_block_pair = [("while", "while_grad"), ("parallel_do",
-        while_op_output.update(while_op_dict[forward_id].output_arg_names())
+                                                "parallel_do_grad")]
-        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
+    ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
    cfgs = [
        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
@@ -268,6 +278,6 @@ def get_cfgs(input_program):
 def memory_optimize(input_program):
-    cfgs = get_cfgs(input_program)
+    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.memory_optimize()
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
                         act,
                         param_attr=None,
                         pool_type='max',
-                         use_cudnn=True):
+                         use_cudnn=True,
+                         use_mkldnn=False):
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act,
-        use_cudnn=use_cudnn)
+        use_cudnn=use_cudnn,
+        use_mkldnn=use_mkldnn)
    pool_out = layers.pool2d(
        input=conv_out,
@@ -58,7 +60,8 @@ def img_conv_group(input,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type=None,
-                   use_cudnn=True):
+                   use_cudnn=True,
+                   use_mkldnn=False):
    """
    Image Convolution Group, Used for vgg net.
    """
@@ -90,7 +93,8 @@ def img_conv_group(input,
            padding=conv_padding[i],
            param_attr=param_attr[i],
            act=local_conv_act,
-            use_cudnn=use_cudnn)
+            use_cudnn=use_cudnn,
+            use_mkldnn=use_mkldnn)
        if conv_with_batchnorm[i]:
            tmp = layers.batch_norm(input=tmp, act=conv_act)

--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -86,10 +86,10 @@ def vgg16_bn_drop(input):
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
    return fc2
@@ -115,7 +115,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
    acc = fluid.layers.accuracy(input=predict, label=label)
    # Test program 
-    test_program = fluid.default_main_program().clone()
+    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -170,7 +170,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
    # TODO(qiao)
    # check other optimizers and check why out will be NAN
    sgd_optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.learning_rate_decay.exponential_decay(
+        learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.0001,
            decay_steps=100000,
            decay_rate=0.5,

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -92,7 +92,7 @@ def train(nn_type,
    else:
        prediction, avg_loss, acc = net_conf(img, label)
-    test_program = fluid.default_main_program().clone()
+    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimize_ops, params_grads = optimizer.minimize(avg_loss)

--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -157,7 +157,7 @@ def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()
    # test program
-    test_program = fluid.default_main_program().clone()
+    test_program = fluid.default_main_program().clone(for_test=True)
    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)

--- a/python/paddle/fluid/tests/book/test_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/test_understand_sentiment.py
@@ -274,7 +274,7 @@ def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
        use_cuda,
        parallel=parallel,
        save_dirname=save_dirname)
-    infer(use_cuda, save_dirname)
+    infer(word_dict, use_cuda, save_dirname)
 class TestUnderstandSentiment(unittest.TestCase):

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -24,15 +24,29 @@ import sys
 fluid.default_startup_program().random_seed = 111
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
 y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+device_type = 'CPU'
-avg_cost = fluid.layers.mean(cost)
+use_nccl = False
+place = fluid.CPUPlace()
+if fluid.core.is_compiled_with_cuda():
+    device_type = 'CUDA'
+    use_nccl = False
+    place = fluid.CUDAPlace(0)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+places = fluid.layers.get_places(device_count=0, device_type=device_type)
+pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+with pd.do():
+    x_ = pd.read_input(x)
+    y_ = pd.read_input(y)
+    y_predict = fluid.layers.fc(input=x_, size=1, act=None)
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y_)
+    avg_cost = fluid.layers.mean(x=cost)
+    pd.write_output(avg_cost)
+cost = pd()
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
 sgd_optimizer.minimize(avg_cost)
 fluid.memory_optimize(fluid.default_main_program())
@@ -48,7 +62,6 @@ train_reader = paddle.batch(
 #         paddle.dataset.uci_housing.train(), buf_size=500),
 #     batch_size=BATCH_SIZE)
-place = fluid.CPUPlace()
 feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
 exe = fluid.Executor(place)
@@ -65,6 +78,7 @@ for pass_id in range(PASS_NUM):
        if avg_loss_value[0] < 10.0:
            exit(0)  # if avg cost less than 10.0, we think our code is good.
+        print(avg_loss_value[0])
        if math.isnan(float(avg_loss_value)):
            sys.exit("got NaN loss, training failed.")
 exit(1)
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -89,10 +89,10 @@ def vgg16_bn_drop(input):
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
    return fc2
@@ -122,7 +122,8 @@ avg_cost = fluid.layers.mean(cost)
 optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 opts = optimizer.minimize(avg_cost)
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+batch_size = fluid.layers.create_tensor(dtype='int64')
+batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
 fluid.memory_optimize(fluid.default_main_program())
@@ -144,13 +145,17 @@ feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe.run(fluid.default_startup_program())
 i = 0
+accuracy = fluid.average.WeightedAverage()
 for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
+    accuracy.reset()
    for data in train_reader():
-        loss, acc = exe.run(fluid.default_main_program(),
+        loss, acc, weight = exe.run(
-                            feed=feeder.feed(data),
+            fluid.default_main_program(),
-                            fetch_list=[avg_cost] + accuracy.metrics)
+            feed=feeder.feed(data),
-        pass_acc = accuracy.eval(exe)
+            fetch_list=[avg_cost, batch_acc, batch_size])
+        accuracy.add(value=acc, weight=weight)
+        pass_acc = accuracy.eval()
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
            pass_acc))
        # this model is slow, so if we can train two mini batch, we think it works properly.

--- a/python/paddle/fluid/tests/test_concurrency.py
+++ b/python/paddle/fluid/tests/test_concurrency.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import framework, unique_name
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import fill_constant
+class TestRoutineOp(unittest.TestCase):
+    def test_simple_routine(self):
+        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+        # Create LOD_TENSOR<INT64> and put it into the scope.  This placeholder
+        # variable will be filled in and returned by fluid.channel_recv
+        result = self._create_tensor('return_value',
+                                     core.VarDesc.VarType.LOD_TENSOR,
+                                     core.VarDesc.VarType.INT64)
+        with fluid.Go():
+            input_value = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
+            fluid.channel_send(ch, input_value)
+        result, status = fluid.channel_recv(ch, result)
+        fluid.channel_close(ch)
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        outs = exe.run(fetch_list=[result])
+        self.assertEqual(outs[0], 1234)
+    def test_daisy_chain(self):
+        '''
+        Mimics classic Daisy-chain test:  https://talks.golang.org/2012/concurrency.slide#39
+        '''
+        n = 100
+        leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+        left = leftmost
+        # TODO(thuan): Use fluid.While() after scope capture is implemented.
+        # https://github.com/PaddlePaddle/Paddle/issues/8502
+        for i in range(n):
+            right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+            with fluid.Go():
+                one_tensor = self._create_one_dim_tensor(1)
+                result = self._create_tensor('return_value',
+                                             core.VarDesc.VarType.LOD_TENSOR,
+                                             core.VarDesc.VarType.INT64)
+                result, status = fluid.channel_recv(right, result)
+                one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
+                fluid.channel_send(left, one_added)
+            left = right
+        # Trigger the channel propagation by sending a "1" to rightmost channel
+        with fluid.Go():
+            one_tensor = self._create_one_dim_tensor(1)
+            fluid.channel_send(right, one_tensor)
+        leftmost_result = self._create_tensor('return_value',
+                                              core.VarDesc.VarType.LOD_TENSOR,
+                                              core.VarDesc.VarType.INT64)
+        leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        leftmost_data = exe.run(fetch_list=[leftmost_result])
+        # The leftmost_data should be equal to the number of channels + 1
+        self.assertEqual(leftmost_data[0][0], n + 1)
+    def _create_one_dim_tensor(self, value):
+        one_dim_tensor = fill_constant(
+            shape=[1], dtype=core.VarDesc.VarType.INT64, value=value)
+        one_dim_tensor.stop_gradient = True
+        return one_dim_tensor
+    def _create_tensor(self, name, type, dtype):
+        return framework.default_main_program().current_block().create_var(
+            name=unique_name.generate(name), type=type, dtype=dtype)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -158,26 +158,9 @@ class TestDetectionMAP(unittest.TestCase):
                append_batch_size=False,
                dtype='float32')
-            map_out, accum_pos_count_out, accum_true_pos_out, accum_false_pos_out = layers.detection_map(
+            map_out = layers.detection_map(detect_res, label, 21)
-                detect_res=detect_res, label=label)
            self.assertIsNotNone(map_out)
-            self.assertIsNotNone(accum_pos_count_out)
-            self.assertIsNotNone(accum_true_pos_out)
-            self.assertIsNotNone(accum_false_pos_out)
            self.assertEqual(map_out.shape, (1, ))
-            map_out, accum_pos_count_out2, accum_true_pos_out2, accum_false_pos_out2 = layers.detection_map(
-                detect_res=detect_res, label=label)
-            self.assertIsNotNone(map_out)
-            self.assertIsNotNone(accum_pos_count_out2)
-            self.assertIsNotNone(accum_true_pos_out2)
-            self.assertIsNotNone(accum_false_pos_out2)
-            self.assertEqual(map_out.shape, (1, ))
-            self.assertEqual(accum_pos_count_out.shape,
-                             accum_pos_count_out2.shape)
-            self.assertEqual(accum_true_pos_out.shape,
-                             accum_true_pos_out2.shape)
-            self.assertEqual(accum_false_pos_out.shape,
-                             accum_false_pos_out2.shape)
        print(str(program))

--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
 class TestConv2dOp(OpTest):
    def setUp(self):
        self.use_cudnn = False
+        self.use_mkldnn = False
        self.init_op_type()
        self.init_group()
        self.init_dilation()
@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
            'paddings': self.pad,
            'groups': self.groups,
            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
        }
        self.outputs = {'Output': output}
@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
 #     def init_op_type(self):
 #         self.op_type = "conv_cudnn"
+#----------------Conv2dMKLDNN----------------
+class TestMKLDNN(TestConv2dOp):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+class TestMKLDNNWithPad(TestWithPad):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+class TestMKLDNNWithStride(TestWithStride):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -22,8 +22,8 @@ from op_test import OpTest
 class TestDetectionMAPOp(OpTest):
    def set_data(self):
+        self.class_num = 4
        self.init_test_case()
        self.mAP = [self.calc_map(self.tf_pos, self.tf_pos_lod)]
        self.label = np.array(self.label).astype('float32')
        self.detect = np.array(self.detect).astype('float32')
@@ -34,10 +34,12 @@ class TestDetectionMAPOp(OpTest):
                'int32')
            self.true_pos = np.array(self.true_pos).astype('float32')
            self.false_pos = np.array(self.false_pos).astype('float32')
+            self.has_state = np.array([1]).astype('int32')
            self.inputs = {
                'Label': (self.label, self.label_lod),
                'DetectRes': (self.detect, self.detect_lod),
+                'HasState': self.has_state,
                'PosCount': self.class_pos_count,
                'TruePos': (self.true_pos, self.true_pos_lod),
                'FalsePos': (self.false_pos, self.false_pos_lod)
@@ -51,7 +53,8 @@ class TestDetectionMAPOp(OpTest):
        self.attrs = {
            'overlap_threshold': self.overlap_threshold,
            'evaluate_difficult': self.evaluate_difficult,
-            'ap_type': self.ap_type
+            'ap_type': self.ap_type,
+            'class_num': self.class_num
        }
        self.out_class_pos_count = np.array(self.out_class_pos_count).astype(
@@ -124,12 +127,7 @@ class TestDetectionMAPOp(OpTest):
            return class_pos_count_dict, true_pos_dict, false_pos_dict
        def get_output_pos(label_count, true_pos, false_pos):
-            max_label = 0
+            label_number = self.class_num
-            for (label, label_pos_num) in label_count.items():
-                if max_label < label:
-                    max_label = label
-            label_number = max_label + 1
            out_class_pos_count = []
            out_true_pos_lod = [0]
@@ -218,11 +216,16 @@ class TestDetectionMAPOp(OpTest):
                mAP += average_precisions
                count += 1
-        self.out_class_pos_count, self.out_true_pos, self.out_true_pos_lod, self.out_false_pos, self.out_false_pos_lod = get_output_pos(
+        pcnt, tp, tp_lod, fp, fp_lod = get_output_pos(label_count, true_pos,
-            label_count, true_pos, false_pos)
+                                                      false_pos)
+        self.out_class_pos_count = pcnt
+        self.out_true_pos = tp
+        self.out_true_pos_lod = tp_lod
+        self.out_false_pos = fp
+        self.out_false_pos_lod = fp_lod
        if count != 0:
            mAP /= count
-        return mAP * 100.0
+        return mAP
    def setUp(self):
        self.op_type = "detection_map"

--- a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
@@ -17,8 +17,8 @@ import math
 import unittest
 import paddle.fluid as fluid
+import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
-import paddle.fluid.learning_rate_decay as lr_decay
 def exponential_decay(learning_rate,
@@ -89,7 +89,7 @@ class TestLearningRateDecay(unittest.TestCase):
        exe.run(fluid.default_startup_program())
        for step in range(10):
            lr_val, = exe.run(fluid.default_main_program(),
-                              feed=[],
+                              feed={},
                              fetch_list=[decayed_lr])
            python_decayed_lr = python_decay_fn(
                global_step=float(step), **kwargs)
@@ -111,27 +111,24 @@ class TestLearningRateDecay(unittest.TestCase):
        common_kwargs_false["staircase"] = False
        decay_fns = [
-            (exponential_decay, lr_decay.exponential_decay, common_kwargs_true),
+            (exponential_decay, layers.exponential_decay, common_kwargs_true),
-            (exponential_decay, lr_decay.exponential_decay,
+            (exponential_decay, layers.exponential_decay, common_kwargs_false),
+            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_true),
+            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
+            (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
+            (inverse_time_decay, layers.inverse_time_decay,
             common_kwargs_false),
-            (natural_exp_decay, lr_decay.natural_exp_decay, common_kwargs_true),
+            (polynomial_decay, layers.polynomial_decay, {
-            (natural_exp_decay, lr_decay.natural_exp_decay,
-             common_kwargs_false),
-            (inverse_time_decay, lr_decay.inverse_time_decay,
-             common_kwargs_true),
-            (inverse_time_decay, lr_decay.inverse_time_decay,
-             common_kwargs_false),
-            (polynomial_decay, lr_decay.polynomial_decay, {
                "learning_rate": 1.0,
                "decay_steps": 5,
                "cycle": True
            }),
-            (polynomial_decay, lr_decay.polynomial_decay, {
+            (polynomial_decay, layers.polynomial_decay, {
                "learning_rate": 1.0,
                "decay_steps": 5,
                "cycle": False
            }),
-            (piecewise_decay, lr_decay.piecewise_decay, {
+            (piecewise_decay, layers.piecewise_decay, {
                "boundaries": [3, 6, 9],
                "values": [0.1, 0.2, 0.3, 0.4]
            }),

--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -19,12 +19,21 @@ import paddle.fluid.core as core
 from op_test import OpTest
-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
    N, C, H, W = x.shape
    if global_pool == 1:
        ksize = [H, W]
-    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+             ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
+                                                   paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
+                                                   paddings[1]) / strides[1] + 1
    out = np.zeros((N, C, H_out, W_out))
    for i in xrange(H_out):
        for j in xrange(W_out):
@@ -38,12 +47,21 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
    return out
-def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def avg_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
    N, C, H, W = x.shape
    if global_pool == 1:
        ksize = [H, W]
-    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+             ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
+                                                   paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
+                                                   paddings[1]) / strides[1] + 1
    out = np.zeros((N, C, H_out, W_out))
    for i in xrange(H_out):
        for j in xrange(W_out):
@@ -65,12 +83,13 @@ class TestPool2d_Op(OpTest):
        self.init_global_pool()
        self.init_op_type()
        self.init_pool_type()
+        self.init_ceil_mode()
        if self.global_pool:
            self.paddings = [0 for _ in range(len(self.paddings))]
        input = np.random.random(self.shape).astype("float32")
        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
-                                           self.paddings,
+                                           self.paddings, self.global_pool,
-                                           self.global_pool).astype("float32")
+                                           self.ceil_mode).astype("float32")
        self.inputs = {'X': input}
        self.attrs = {
@@ -80,6 +99,7 @@ class TestPool2d_Op(OpTest):
            'pooling_type': self.pool_type,
            'global_pooling': self.global_pool,
            'use_cudnn': self.use_cudnn,
+            'ceil_mode': self.ceil_mode,
            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
        }
@@ -116,6 +136,9 @@ class TestPool2d_Op(OpTest):
    def init_global_pool(self):
        self.global_pool = True
+    def init_ceil_mode(self):
+        self.ceil_mode = False
 class TestCase1(TestPool2d_Op):
    def init_test_case(self):
@@ -217,5 +240,25 @@ class TestCUDNNCase6(TestCase5):
        self.op_type = "pool2d"
+class TestCeilModeCase1(TestCUDNNCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+class TestCeilModeCase2(TestCUDNNCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+class TestCeilModeCase3(TestCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+class TestCeilModeCase4(TestCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -19,13 +19,24 @@ import paddle.fluid.core as core
 from op_test import OpTest
-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
    N, C, D, H, W = x.shape
    if global_pool == 1:
        ksize = [D, H, W]
-    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+             ) / strides[0] + 1 if ceil_mode else (D - ksize[0] + 2 *
-    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+                                                   paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (H - ksize[1] + 2 *
+                                                   paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
+             ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
+                                                   paddings[2]) / strides[2] + 1
    out = np.zeros((N, C, D_out, H_out, W_out))
    for k in xrange(D_out):
        d_start = np.max((k * strides[0] - paddings[0], 0))
@@ -42,13 +53,24 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
    return out
-def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def avg_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
    N, C, D, H, W = x.shape
    if global_pool == 1:
        ksize = [D, H, W]
-    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+             ) / strides[0] + 1 if ceil_mode else (D - ksize[0] + 2 *
-    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+                                                   paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (H - ksize[1] + 2 *
+                                                   paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
+             ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
+                                                   paddings[2]) / strides[2] + 1
    out = np.zeros((N, C, D_out, H_out, W_out))
    for k in xrange(D_out):
        d_start = np.max((k * strides[0] - paddings[0], 0))
@@ -73,13 +95,14 @@ class TestPool3d_Op(OpTest):
        self.init_global_pool()
        self.init_op_type()
        self.init_pool_type()
+        self.init_ceil_mode()
        if self.global_pool:
            self.paddings = [0 for _ in range(len(self.paddings))]
        input = np.random.random(self.shape).astype("float32")
        output = self.pool3D_forward_naive(input, self.ksize, self.strides,
-                                           self.paddings,
+                                           self.paddings, self.global_pool,
-                                           self.global_pool).astype("float32")
+                                           self.ceil_mode).astype("float32")
        self.inputs = {'X': input}
        self.attrs = {
@@ -89,6 +112,7 @@ class TestPool3d_Op(OpTest):
            'pooling_type': self.pool_type,
            'global_pooling': self.global_pool,
            'use_cudnn': self.use_cudnn,
+            'ceil_mode': self.ceil_mode,
            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
        }
@@ -125,6 +149,9 @@ class TestPool3d_Op(OpTest):
    def init_global_pool(self):
        self.global_pool = True
+    def init_ceil_mode(self):
+        self.ceil_mode = False
 class TestCase1(TestPool3d_Op):
    def init_test_case(self):
@@ -227,5 +254,25 @@ class TestCUDNNCase6(TestCase5):
        self.op_type = "pool3d"
+class TestCeilModeCase1(TestCUDNNCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+class TestCeilModeCase2(TestCUDNNCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+class TestCeilModeCase3(TestCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+class TestCeilModeCase4(TestCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -37,7 +37,9 @@ class TestProfiler(unittest.TestCase):
            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(cost)
-            accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+            batch_size = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(
+                input=predict, label=label, total=batch_size)
        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
        opts = optimizer.minimize(avg_cost, startup_program=startup_program)
@@ -46,7 +48,7 @@ class TestProfiler(unittest.TestCase):
        exe = fluid.Executor(place)
        exe.run(startup_program)
-        accuracy.reset(exe)
+        pass_acc_calculator = fluid.average.WeightedAverage()
        with profiler.profiler(state, 'total', profile_path) as prof:
            for iter in range(10):
                if iter == 2:
@@ -57,9 +59,11 @@ class TestProfiler(unittest.TestCase):
                outs = exe.run(main_program,
                               feed={'x': x,
                                     'y': y},
-                               fetch_list=[avg_cost] + accuracy.metrics)
+                               fetch_list=[avg_cost, batch_acc, batch_size])
                acc = np.array(outs[1])
-                pass_acc = accuracy.eval(exe)
+                b_size = np.array(outs[2])
+                pass_acc_calculator.add(value=acc, weight=b_size)
+                pass_acc = pass_acc_calculator.eval()
    def test_cpu_profiler(self):
        self.net_profiler('CPU')

--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -45,5 +45,33 @@ class TestReshapeOpDimInfer(OpTest):
        self.check_grad(["X"], "Out")
+class TestReshapeOpInplace(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [10 * 20], 'inplace': True}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+class TestReshapeOpDimInferInplace(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [4, -1, 5], 'inplace': True}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
 if __name__ == '__main__':
    unittest.main()
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -124,15 +124,24 @@ class Timeline(object):
            if event.device_id not in self._devices:
                pid = self._allocate_pid()
                self._devices[event.device_id] = pid
-                self._chrome_trace.emit_pid("device:%s" % pid, pid)
+                if event.device_id >= 0:
+                    self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
+                                                (pid, event.stream_id), pid)
+                elif event.device_id == -1:
+                    self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
+                                                event.stream_id, pid)
    def _allocate_events(self):
        for event in self._profile_pb.events:
            pid = self._devices[event.device_id]
            args = {'name': event.name}
-            self._chrome_trace.emit_region(
+            if event.memcopy.bytes > 0:
-                event.start_ns, (event.end_ns - event.start_ns) / 1000000.0,
+                args = {'mem_bytes': event.memcopy.bytes}
-                pid, 0, 'Op', event.name, args)
+            # TODO(panyx0718): Chrome tracing only handles ms. However, some
+            # ops take microseconds. Hence, we keep the ns here.
+            self._chrome_trace.emit_region(event.start_ns,
+                                           (event.end_ns - event.start_ns) /
+                                           1.0, pid, 0, 'Op', event.name, args)
    def generate_chrome_trace(self):
        self._allocate_pids()